From f7dfc9bbb8d0a0816ae6c8bd46b9d86ea419ff47 Mon Sep 17 00:00:00 2001
From: dgtlmoon <dgtlmoon@gmail.com>
Date: Thu, 9 Oct 2025 00:14:28 +0200
Subject: [PATCH] Refactor content type detection, fixing more xpath issues for
 RSS types (#3465)  #3462  #3391

---
 changedetectionio/processors/magic.py         | 138 ++++++++++++++++++
 .../processors/text_json_diff/processor.py    |  78 +++++-----
 changedetectionio/tests/test_backend.py       |   3 +
 changedetectionio/tests/test_group.py         |   2 -
 .../tests/test_history_consistency.py         |   3 +-
 .../tests/test_xpath_selector.py              |  82 ++++++++++-
 6 files changed, 253 insertions(+), 53 deletions(-)
 create mode 100644 changedetectionio/processors/magic.py
diff --git a/changedetectionio/processors/magic.py b/changedetectionio/processors/magic.py
new file mode 100644
index 00000000..bdfda2de
--- /dev/null
+++ b/changedetectionio/processors/magic.py
@@ -0,0 +1,138 @@
+"""
+Content Type Detection and Stream Classification
+
+This module provides intelligent content-type detection for changedetection.io.
+It addresses the common problem where HTTP Content-Type headers are missing, incorrect,
+or too generic, which would otherwise cause the wrong processor to be used.
+
+The guess_stream_type class combines:
+1. HTTP Content-Type headers (when available and reliable)
+2. Python-magic library for MIME detection (analyzing actual file content)
+3. Content-based pattern matching for text formats (HTML tags, XML declarations, etc.)
+
+This multi-layered approach ensures accurate detection of RSS feeds, JSON, HTML, PDF,
+plain text, CSV, YAML, and XML formats - even when servers provide misleading headers.
+
+Used by: processors/text_json_diff/processor.py and other content processors
+"""
+
+# Content-types treated as RSS/Atom feeds, i.e. when to apply the 'cdata to real HTML' hack
+RSS_XML_CONTENT_TYPES = [
+    "application/rss+xml",
+    "application/rdf+xml",
+    "text/xml",
+    "application/xml",
+    "application/atom+xml",
+    "text/rss+xml",  # rare, non-standard
+    "application/x-rss+xml",  # legacy (older feed software)
+    "application/x-atom+xml",  # legacy (older Atom)
+]
+
+# JSON Content-types
+JSON_CONTENT_TYPES = [
+    "application/activity+json",
+    "application/feed+json",
+    "application/json",
+    "application/ld+json",
+    "application/vnd.api+json",
+]
+
+# CSV Content-types
+CSV_CONTENT_TYPES = [
+    "text/csv",
+    "application/csv",
+]
+
+# Generic XML Content-types (non-RSS/Atom). NOTE(review): both entries are also listed in RSS_XML_CONTENT_TYPES
+XML_CONTENT_TYPES = [
+    "text/xml",
+    "application/xml",
+]
+
+# YAML Content-types
+YAML_CONTENT_TYPES = [
+    "text/yaml",
+    "text/x-yaml",
+    "application/yaml",
+    "application/x-yaml",
+]
+
+HTML_PATTERNS = ['<!doctype html', '<html', '<head', '<body', '<script', '<iframe', '<div']  # Lowercase substrings looked for in the first 200 characters of the content
+
+import re
+import magic
+from loguru import logger
+
+
+class guess_stream_type():  # Classifies fetched content; __init__ sets at most one of the is_* flags below
+    is_pdf = False
+    is_json = False
+    is_html = False
+    is_plaintext = False
+    is_rss = False
+    is_csv = False
+    is_xml = False  # Generic XML, not RSS/Atom
+    is_yaml = False
+
+    def __init__(self, http_content_header, content):
+        """Pick the stream type from the HTTP content-type value, libmagic's guess and patterns in the first 200 characters of `content`."""
+        magic_content_header = http_content_header
+        test_content = content[:200].lower().strip()
+
+        # Remove whitespace between < and tag name for robust detection (handles '< html', '<\nhtml', etc.)
+        test_content_normalized = re.sub(r'<\s+', '<', test_content)
+
+        # Magic will sometimes call text/plain as text/html! So its text/html and text/plain results never replace the header value
+        magic_result = None
+        try:
+            mime = magic.from_buffer(content[:200], mime=True) # Send the original content (not lowercased or stripped)
+            logger.debug(f"Guessing mime type, original content_type '{http_content_header}', mime type detected '{mime}'")
+            if mime and "/" in mime:
+                magic_result = mime
+                # Ignore generic/fallback mime types from magic
+                if mime in ['application/octet-stream', 'application/x-empty', 'binary']:
+                    logger.debug(f"Ignoring generic mime type '{mime}' from magic library")
+                # Any other result except text/html and text/plain is used alongside the HTTP header in the checks below
+                elif mime not in ['text/html', 'text/plain']:
+                    magic_content_header = mime
+
+        except Exception as e:
+            logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}), using content-based detection")
+
+        # Content-based detection (most reliable for text formats)
+        # Check for HTML patterns first - if found, override magic's text/plain
+        has_html_patterns = any(p in test_content_normalized for p in HTML_PATTERNS)
+
+        # Header and magic based checks come first; the branches are tried in order and the first match wins
+        if any(s in http_content_header for s in RSS_XML_CONTENT_TYPES) or any(s in magic_content_header for s in RSS_XML_CONTENT_TYPES):
+            self.is_rss = True
+        elif any(s in http_content_header for s in JSON_CONTENT_TYPES) or any(s in magic_content_header for s in JSON_CONTENT_TYPES):
+            self.is_json = True
+        elif any(s in http_content_header for s in CSV_CONTENT_TYPES) or any(s in magic_content_header for s in CSV_CONTENT_TYPES):
+            self.is_csv = True
+        elif any(s in http_content_header for s in XML_CONTENT_TYPES) or any(s in magic_content_header for s in XML_CONTENT_TYPES):
+            # Only mark as generic XML if not already detected as RSS. NOTE(review): branch looks unreachable, both XML types are also in RSS_XML_CONTENT_TYPES
+            if not self.is_rss:
+                self.is_xml = True
+        elif any(s in http_content_header for s in YAML_CONTENT_TYPES) or any(s in magic_content_header for s in YAML_CONTENT_TYPES):
+            self.is_yaml = True
+        elif 'pdf' in magic_content_header:
+            self.is_pdf = True
+        # NOTE(review): the header test below is an exact match, so 'text/html; charset=utf-8' only becomes is_html via has_html_patterns - confirm intended
+        elif has_html_patterns or http_content_header == 'text/html':
+            self.is_html = True
+        # If magic says text/plain and we found no HTML patterns, trust it
+        elif magic_result == 'text/plain':
+            self.is_plaintext = True
+            logger.debug(f"Trusting magic's text/plain result (no HTML patterns detected)")
+        elif '<rss' in test_content_normalized or '<feed' in test_content_normalized:
+            self.is_rss = True
+        elif test_content_normalized.startswith('<?xml'):
+            # Generic XML that's not RSS/Atom (the '<rss' / '<feed' markers were checked in the branch above)
+            self.is_xml = True
+        elif '%pdf-1' in test_content:
+            self.is_pdf = True
+        # Last resort: only trust a 'text' content type if no other pattern matched
+        elif 'text' in magic_content_header:
+            self.is_plaintext = True
+
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index 45d64421..6b23b408 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -13,6 +13,8 @@ from changedetectionio import html_tools, content_fetchers
 from changedetectionio.blueprint.price_data_follower import PRICE_DATA_TRACK_ACCEPT, PRICE_DATA_TRACK_REJECT
 from loguru import logger
 
+from changedetectionio.processors.magic import guess_stream_type
+
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 name = 'Webpage Text/HTML, JSON and PDF changes'
@@ -20,6 +22,9 @@ description = 'Detects all text changes where possible'
 
 json_filter_prefixes = ['json:', 'jq:', 'jqraw:']
 
+# Assume it's this type if the server says nothing on content-type
+DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER = 'text/html'
+
 class FilterNotFoundInResponse(ValueError):
     def __init__(self, msg, screenshot=None, xpath_data=None):
         self.screenshot = screenshot
@@ -45,6 +50,9 @@ class perform_site_check(difference_detection_processor):
         if not watch:
             raise Exception("Watch no longer exists.")
 
+        ctype_header = self.fetcher.get_all_headers().get('content-type', DEFAULT_WHEN_NO_CONTENT_TYPE_HEADER).lower()
+        stream_content_type = guess_stream_type(http_content_header=ctype_header, content=self.fetcher.content)
+
         # Unset any existing notification error
         update_obj = {'last_notification_error': False, 'last_error': False}
 
@@ -54,7 +62,7 @@ class perform_site_check(difference_detection_processor):
         self.xpath_data = self.fetcher.xpath_data
 
         # Track the content type
-        update_obj['content_type'] = self.fetcher.get_all_headers().get('content-type', '').lower()
+        update_obj['content_type'] = ctype_header
 
         # Watches added automatically in the queue manager will skip if its the same checksum as the previous run
         # Saves a lot of CPU
@@ -69,24 +77,12 @@ class perform_site_check(difference_detection_processor):
         # https://stackoverflow.com/questions/41817578/basic-method-chaining ?
         # return content().textfilter().jsonextract().checksumcompare() ?
 
-        is_json = 'application/json' in self.fetcher.get_all_headers().get('content-type', '').lower()
-        is_html = not is_json
-        is_rss = False
 
-        ctype_header = self.fetcher.get_all_headers().get('content-type', '').lower()
         # Go into RSS preprocess for converting CDATA/comment to usable text
-        if any(substring in ctype_header for substring in ['application/xml', 'application/rss', 'text/xml']):
-            if '<rss' in self.fetcher.content[:100].lower():
-                self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
-                is_rss = True
+        if stream_content_type.is_rss:
+            self.fetcher.content = cdata_in_document_to_text(html_content=self.fetcher.content)
 
-        # source: support, basically treat it as plaintext
-        if watch.is_source_type_url:
-            is_html = False
-            is_json = False
-
-        inline_pdf = self.fetcher.get_all_headers().get('content-disposition', '') and '%PDF-1' in self.fetcher.content[:10]
-        if watch.is_pdf or 'application/pdf' in self.fetcher.get_all_headers().get('content-type', '').lower() or inline_pdf:
+        if watch.is_pdf or stream_content_type.is_pdf:
             from shutil import which
             tool = os.getenv("PDF_TO_HTML_TOOL", "pdftohtml")
             if not which(tool):
@@ -130,11 +126,12 @@ class perform_site_check(difference_detection_processor):
         has_filter_rule = len(include_filters_rule) and len(include_filters_rule[0].strip())
         has_subtractive_selectors = len(subtractive_selectors) and len(subtractive_selectors[0].strip())
 
-        if is_json and not has_filter_rule:
-            include_filters_rule.append("json:$")
-            has_filter_rule = True
+        if stream_content_type.is_json:
+            if not has_filter_rule:
+                # Force a reformat
+                include_filters_rule.append("json:$")
+                has_filter_rule = True
 
-        if is_json:
             # Sort the JSON so we dont get false alerts when the content is just re-ordered
             try:
                 self.fetcher.content = json.dumps(json.loads(self.fetcher.content), sort_keys=True)
@@ -142,34 +139,25 @@ class perform_site_check(difference_detection_processor):
                 # Might have just been a snippet, or otherwise bad JSON, continue
                 pass
 
-        if has_filter_rule:
-            for filter in include_filters_rule:
-                if any(prefix in filter for prefix in json_filter_prefixes):
-                    stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
-                    is_html = False
+            if has_filter_rule:
+                for filter in include_filters_rule:
+                    if any(prefix in filter for prefix in json_filter_prefixes):
+                        stripped_text_from_html += html_tools.extract_json_as_string(content=self.fetcher.content, json_filter=filter)
+                        if stripped_text_from_html:
+                            stream_content_type.is_json = True
+                            stream_content_type.is_html = False
 
-        if is_html or watch.is_source_type_url:
+        # We have 'watch.is_source_type_url' because we should be able to use selectors on the raw HTML but return just that selected HTML
+        if stream_content_type.is_html or watch.is_source_type_url or stream_content_type.is_plaintext or stream_content_type.is_rss or stream_content_type.is_xml or stream_content_type.is_pdf:
 
             # CSS Filter, extract the HTML that matches and feed that into the existing inscriptis::get_text
             self.fetcher.content = html_tools.workarounds_for_obfuscations(self.fetcher.content)
             html_content = self.fetcher.content
-            content_type = self.fetcher.get_all_headers().get('content-type', '').lower()
-            is_attachment = 'attachment' in self.fetcher.get_all_headers().get('content-disposition', '').lower() or 'octet-stream' in content_type
 
-            # Try to detect better mime types if its a download or not announced as HTML
-            if is_attachment:
-                logger.debug(f"Got a reply that may be a download or possibly a text attachment, checking..")
-                try:
-                    import magic
-                    mime = magic.from_buffer(html_content, mime=True)
-                    logger.debug(f"Guessing mime type, original content_type '{content_type}', mime type detected '{mime}'")
-                    if mime and "/" in mime: # looks valid and is a valid mime type
-                        content_type = mime
-                except Exception as e:
-                    logger.error(f"Error getting a more precise mime type from 'magic' library ({str(e)}")
-
-            if 'text/' in content_type and not 'html' in content_type:
+            # Some kind of "text" but definitely not RSS looking
+            if stream_content_type.is_plaintext:
                 # Don't run get_text or xpath/css filters on plaintext
+                # We are not HTML, we are not any kind of RSS, doesnt even look like HTML
                 stripped_text_from_html = html_content
             else:
                 # If not JSON, and if it's not text/plain..
@@ -186,13 +174,13 @@ class perform_site_check(difference_detection_processor):
                             html_content += html_tools.xpath_filter(xpath_filter=filter_rule.replace('xpath:', ''),
                                                                     html_content=self.fetcher.content,
                                                                     append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                    is_rss=is_rss)
+                                                                    is_rss=stream_content_type.is_rss)
 
                         elif filter_rule.startswith('xpath1:'):
                             html_content += html_tools.xpath1_filter(xpath_filter=filter_rule.replace('xpath1:', ''),
                                                                      html_content=self.fetcher.content,
                                                                      append_pretty_line_formatting=not watch.is_source_type_url,
-                                                                     is_rss=is_rss)
+                                                                     is_rss=stream_content_type.is_rss)
                         else:
                             html_content += html_tools.include_filters(include_filters=filter_rule,
                                                                        html_content=self.fetcher.content,
@@ -211,7 +199,7 @@ class perform_site_check(difference_detection_processor):
                     do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
                     stripped_text_from_html = html_tools.html_to_text(html_content=html_content,
                                                                       render_anchor_tag_content=do_anchor,
-                                                                      is_rss=is_rss)  # 1874 activate the <title workaround hack
+                                                                      is_rss=stream_content_type.is_rss)  # 1874 activate the <title workaround hack
 
         if watch.get('trim_text_whitespace'):
             stripped_text_from_html = '\n'.join(line.strip() for line in stripped_text_from_html.replace("\n\n", "\n").splitlines())
@@ -250,7 +238,7 @@ class perform_site_check(difference_detection_processor):
 
         # Treat pages with no renderable text content as a change? No by default
         empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
-        if not is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
+        if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text_from_html.strip()) == 0:
             raise content_fetchers.exceptions.ReplyWithContentButNoText(url=url,
                                                             status_code=self.fetcher.get_last_status_code(),
                                                             screenshot=self.fetcher.screenshot,
diff --git a/changedetectionio/tests/test_backend.py b/changedetectionio/tests/test_backend.py
index 1447e7bf..7d0a311a 100644
--- a/changedetectionio/tests/test_backend.py
+++ b/changedetectionio/tests/test_backend.py
@@ -174,6 +174,8 @@ def test_non_text_mime_or_downloads(client, live_server, measure_memory_usage):
     but once the server sends content-type: application/octet-stream (which is usually done to force the browser to show the Download dialog),
     changedetection somehow ignores all line breaks and treats the document file as if everything is on one line.
 
+    WHAT THIS DOES - makes the system rely on 'magic' to determine what it is
+
     :param client:
     :param live_server:
     :param measure_memory_usage:
@@ -271,6 +273,7 @@ got it\r\n
         url_for("ui.ui_views.preview_page", uuid="first"),
         follow_redirects=True
     )
+
     assert b"some random text that should be split by line\n" in res.data
     ####
 
diff --git a/changedetectionio/tests/test_group.py b/changedetectionio/tests/test_group.py
index 5e2596c5..e63639a9 100644
--- a/changedetectionio/tests/test_group.py
+++ b/changedetectionio/tests/test_group.py
@@ -264,8 +264,6 @@ def test_limit_tag_ui(client, live_server, measure_memory_usage):
     client.get(url_for('ui.mark_all_viewed', tag=tag_uuid), follow_redirects=True)
     wait_for_all_checks(client)
 
-    with open('/tmp/fuck.html', 'wb') as f:
-        f.write(res.data)
     # Should be only 1 unviewed
     res = client.get(url_for("watchlist.index"))
     assert res.data.count(b' unviewed ') == 1
diff --git a/changedetectionio/tests/test_history_consistency.py b/changedetectionio/tests/test_history_consistency.py
index a16e99f7..b8a21cf2 100644
--- a/changedetectionio/tests/test_history_consistency.py
+++ b/changedetectionio/tests/test_history_consistency.py
@@ -3,9 +3,8 @@
 import time
 import os
 import json
-import logging
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
+from .util import wait_for_all_checks
 from urllib.parse import urlparse, parse_qs
 
 def test_consistent_history(client, live_server, measure_memory_usage):
diff --git a/changedetectionio/tests/test_xpath_selector.py b/changedetectionio/tests/test_xpath_selector.py
index d79fa4b3..abcc766a 100644
--- a/changedetectionio/tests/test_xpath_selector.py
+++ b/changedetectionio/tests/test_xpath_selector.py
@@ -1,12 +1,42 @@
 # -*- coding: utf-8 -*-
 
-import time
+
 from flask import url_for
-from .util import live_server_setup, wait_for_all_checks
-
-from ..html_tools import *
+from .util import  wait_for_all_checks
+from ..processors.magic import RSS_XML_CONTENT_TYPES
 
 
+def set_rss_atom_feed_response(header=''):
+    """Write a sample RSS 2.0 feed, prefixed with `header`, to test-datastore/endpoint-content.txt."""
+    test_return_data = f"""{header}<!-- Generated on Wed, 08 Oct 2025 08:42:33 -0700, really really honestly  -->
+<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
+<channel>
+    <atom:link href="https://store.waterpowered.com/news/collection//" rel="self" type="application/rss+xml"/>
+    <title>RSS Feed</title>
+    <link>
+        <![CDATA[ https://store.waterpowered.com/news/collection// ]]>
+    </link>
+    <description>
+        <![CDATA[ Events and Announcements for ]]>
+    </description>
+    <language>en-us</language>
+    <generator>water News RSS</generator>
+    <item>
+        <title> 🍁 Lets go discount</title>
+        <description><p class="bb_paragraph">ok heres the description</p></description>
+        <link>
+        <![CDATA[ https://store.waterpowered.com/news/app/1643320/view/511845698831908921 ]]>
+        </link>
+        <pubDate>Wed, 08 Oct 2025 15:28:55 +0000</pubDate>
+        <guid isPermaLink="true">https://store.waterpowered.com/news/app/1643320/view/511845698831908921</guid>
+        <enclosure url="https://clan.fastly.waterstatic.com/images/40721482/42822e5f00b2becf520ace9500981bb56f3a89f2.jpg" length="0" type="image/jpeg"/>
+    </item>
+</channel>
+</rss>"""
+
+    # Explicit UTF-8: the feed contains non-ASCII text (the emoji in the item title), the locale default may not encode it
+    with open("test-datastore/endpoint-content.txt", "w", encoding="utf-8") as f:
+        f.write(test_return_data)
 
 
 
@@ -575,3 +605,47 @@ def test_xpath_20_function_string_join_matches(client, live_server, measure_memo
 
     client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
 
+
+def _subtest_xpath_rss(client, content_type='text/html'):
+    """Add a watch on the test endpoint served as `content_type`, filter it with xpath://item and check the preview."""
+    # Add our URL to the import page
+    test_url = url_for('test_endpoint', content_type=content_type, _external=True)
+    res = client.post(
+        url_for("ui.ui_views.form_quick_watch_add"),
+        data={"url": test_url, "tags": '', 'edit_and_watch_submit_button': 'Edit > Watch'},
+        follow_redirects=True
+    )
+
+    assert b"Watch added in Paused state, saving will unpause" in res.data
+
+    res = client.post(
+        url_for("ui.ui_edit.edit_page", uuid="first", unpause_on_save=1),
+        data={
+            "url": test_url,
+            "include_filters": "xpath://item",
+            "tags": '',
+            "fetch_backend": "html_requests",
+            "time_between_check_use_default": "y",
+        },
+        follow_redirects=True
+    )
+
+    assert b"unpaused" in res.data
+    wait_for_all_checks(client)
+
+    res = client.get(
+        url_for("ui.ui_views.preview_page", uuid="first"),
+        follow_redirects=True
+    )
+
+    assert b"Lets go discount" in res.data, f"When testing for Lets go discount called with content type '{content_type}'"
+    assert b"Events and Announcements" not in res.data, f"When testing that 'Events and Announcements' is filtered out, called with content type '{content_type}'" # It should not be here because thats not our selector target
+
+    client.get(url_for("ui.form_delete", uuid="all"), follow_redirects=True)
+
+def test_rss_xpath(client, live_server):
+    """Be sure all-in-the-wild types of RSS feeds work with xpath, both without and with an XML declaration."""
+    for xml_declaration in ('', '<?xml version="1.0" encoding="utf-8"?>'):
+        set_rss_atom_feed_response(header=xml_declaration)
+        for declared_type in RSS_XML_CONTENT_TYPES:
+            _subtest_xpath_rss(client, content_type=declared_type)