From f7dfc9bbb8d0a0816ae6c8bd46b9d86ea419ff47 Mon Sep 17 00:00:00 2001 From: dgtlmoon Date: Thu, 9 Oct 2025 00:14:28 +0200 Subject: [PATCH] Refactor content type detection, fixing more xpath issues for RSS types (#3465) #3462 #3391 --- changedetectionio/processors/magic.py | 138 ++++++++++++++++++ .../processors/text_json_diff/processor.py | 78 +++++----- changedetectionio/tests/test_backend.py | 3 + changedetectionio/tests/test_group.py | 2 - .../tests/test_history_consistency.py | 3 +- .../tests/test_xpath_selector.py | 82 ++++++++++- 6 files changed, 253 insertions(+), 53 deletions(-) create mode 100644 changedetectionio/processors/magic.py diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py new file mode 100644 index 00000000..bdfda2de --- /dev/null +++ b/changedetectionio/processors/magic.py @@ -0,0 +1,138 @@ +""" +Content Type Detection and Stream Classification + +This module provides intelligent content-type detection for changedetection.io. +It addresses the common problem where HTTP Content-Type headers are missing, incorrect, +or too generic, which would otherwise cause the wrong processor to be used. + +The guess_stream_type class combines: +1. HTTP Content-Type headers (when available and reliable) +2. Python-magic library for MIME detection (analyzing actual file content) +3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.) + +This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF, +plain text, CSV, YAML, and XML formats - even when servers provide misleading headers. + +Used by: processors/text_json_diff/processor.py and other content processors +""" + +# When to apply the 'cdata to real HTML' hack +RSS_XML_CONTENT_TYPES = [ + "application/rss+xml", + "application/rdf+xml", + "text/xml", + "application/xml", + "application/atom+xml", + "text/rss+xml", # rare, non-standard + "application/x-rss+xml", # legacy (older feed software) + "application/x-atom+xml", # legacy (older Atom) +] + +# JSON Content-types +JSON_CONTENT_TYPES = [ + "application/activity+json", + "application/feed+json", + "application/json", + "application/ld+json", + "application/vnd.api+json", +] + +# CSV Content-types +CSV_CONTENT_TYPES = [ + "text/csv", + "application/csv", +] + +# Generic XML Content-types (non-RSS/Atom) +XML_CONTENT_TYPES = [ + "text/xml", + "application/xml", +] + +# YAML Content-types +YAML_CONTENT_TYPES = [ + "text/yaml", + "text/x-yaml", + "application/yaml", + "application/x-yaml", +] + +HTML_PATTERNS = [' + + + + RSS Feed + + + + + + + en-us + water News RSS + + 🍁 Lets go discount +

ok heres the description

+ + + + Wed, 08 Oct 2025 15:28:55 +0000 + https://store.waterpowered.com/news/app/1643320/view/511845698831908921 + +
+
+
""" + + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(test_return_data) + + return None @@ -575,3 +605,47 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True) + +def _subtest_xpath_rss(client, content_type='text/html'): + + # Add our URL to the import page + test_url = url_for('test_endpoint', content_type=content_type, _external=True) + res = client.post( + url_for("ui.ui_views.form_quick_watch_add"), + data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'}, + follow_redirects=True + ) + + assert b"Watch added in Paused state, saving will unpause" in res.data + + res = client.post( + url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1), + data={ + "url": test_url, + "include_filters": "xpath://item", + "tags": '', + "fetch_backend": "html_requests", + "time_between_check_use_default": "y", + }, + follow_redirects=True + ) + + assert b"unpaused" in res.data + wait_for_all_checks(client) + + res = client.get( + url_for("ui.ui_views.preview_page", uuid="first"), + follow_redirects=True + ) + + assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'" + assert b"Events and Announcements" not in res.data, f"When testing for Lets go discount called with content type '{content_type}'" # It should not be here because thats not our selector target + + client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True) + +# Be sure all-in-the-wild types of RSS feeds work with xpath +def test_rss_xpath(client, live_server): + for feed_header in ['', '']: + set_rss_atom_feed_response(header=feed_header) + for content_type in RSS_XML_CONTENT_TYPES: + _subtest_xpath_rss(client, content_type=content_type)