changedetection.io/changedetectionio/html_tools.py

from loguru import logger
from lxml import etree
from typing import List
import json
import re

# HTML added to be sure each result matching a filter (.example) gets converted to a new line by Inscriptis
TEXT_FILTER_LIST_LINE_SUFFIX = "<br>"
TRANSLATE_WHITESPACE_TABLE = str.maketrans('', '', '\r\n\t ')
PERL_STYLE_REGEX = r'^/(.*?)/([a-z]*)?$'

# 'price' , 'lowPrice', 'highPrice' are usually under here
# All of those may or may not appear on different websites - I didnt find a way todo case-insensitive searching here
LD_JSON_PRODUCT_OFFER_SELECTORS = ["json:$..offers", "json:$..Offers"]

class JSONNotFound(ValueError):
    def __init__(self, msg):
        ValueError.__init__(self, msg)


# Doesn't look like python supports forward slash auto enclosure in re.findall
# So convert it to inline flag "(?i)foobar" type configuration
def perl_style_slash_enclosed_regex_to_options(regex):

    res = re.search(PERL_STYLE_REGEX, regex, re.IGNORECASE)

    if res:
        flags = res.group(2) if res.group(2) else 'i'
        regex = f"(?{flags}){res.group(1)}"
    else:
        # Fall back to just ignorecase as an option
        regex = f"(?i){regex}"

    return regex

# Given a CSS Rule, and a blob of HTML, return the blob of HTML that matches
def include_filters(include_filters, html_content, append_pretty_line_formatting=False):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    html_block = ""
    r = soup.select(include_filters, separator="")

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and not element.name in (['br', 'hr', 'div', 'p']):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        html_block += str(element)

    return html_block

def subtractive_css_selector(css_selector, html_content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # So that the elements dont shift their index, build a list of elements here which will be pointers to their place in the DOM
    elements_to_remove = soup.select(css_selector)

    # Then, remove them in a separate loop
    for item in elements_to_remove:
        item.decompose()

    return str(soup)

def subtractive_xpath_selector(selectors: List[str], html_content: str) -> str:
    # Parse the HTML content using lxml
    html_tree = etree.HTML(html_content)

    # First, collect all elements to remove
    elements_to_remove = []

    # Iterate over the list of XPath selectors
    for selector in selectors:
        # Collect elements for each selector
        elements_to_remove.extend(html_tree.xpath(selector))

    # Then, remove them in a separate loop
    for element in elements_to_remove:
        if element.getparent() is not None:  # Ensure the element has a parent before removing
            element.getparent().remove(element)

    # Convert the modified HTML tree back to a string
    modified_html = etree.tostring(html_tree, method="html").decode("utf-8")
    return modified_html


def element_removal(selectors: List[str], html_content):
    """Removes elements that match a list of CSS or XPath selectors."""
    modified_html = html_content
    css_selectors = []
    xpath_selectors = []

    for selector in selectors:
        if selector.startswith(('xpath:', 'xpath1:', '//')):
            # Handle XPath selectors separately
            xpath_selector = selector.removeprefix('xpath:').removeprefix('xpath1:')
            xpath_selectors.append(xpath_selector)
        else:
            # Collect CSS selectors as one "hit", see comment in subtractive_css_selector
            css_selectors.append(selector.strip().strip(","))

    if xpath_selectors:
        modified_html = subtractive_xpath_selector(xpath_selectors, modified_html)

    if css_selectors:
        # Remove duplicates, then combine all CSS selectors into one string, separated by commas
        # This stops the elements index shifting
        unique_selectors = list(set(css_selectors))  # Ensure uniqueness
        combined_css_selector = " , ".join(unique_selectors)
        modified_html = subtractive_css_selector(combined_css_selector, modified_html)


    return modified_html

def elementpath_tostring(obj):
    """
    change elementpath.select results to string type
    # The MIT License (MIT), Copyright (c), 2018-2021, SISSA (Scuola Internazionale Superiore di Studi Avanzati)
    # https://github.com/sissaschool/elementpath/blob/dfcc2fd3d6011b16e02bf30459a7924f547b47d0/elementpath/xpath_tokens.py#L1038
    """

    import elementpath
    from decimal import Decimal
    import math

    if obj is None:
        return ''
    # https://elementpath.readthedocs.io/en/latest/xpath_api.html#elementpath.select
    elif isinstance(obj, elementpath.XPathNode):
        return obj.string_value
    elif isinstance(obj, bool):
        return 'true' if obj else 'false'
    elif isinstance(obj, Decimal):
        value = format(obj, 'f')
        if '.' in value:
            return value.rstrip('0').rstrip('.')
        return value

    elif isinstance(obj, float):
        if math.isnan(obj):
            return 'NaN'
        elif math.isinf(obj):
            return str(obj).upper()

        value = str(obj)
        if '.' in value:
            value = value.rstrip('0').rstrip('.')
        if '+' in value:
            value = value.replace('+', '')
        if 'e' in value:
            return value.upper()
        return value

    return str(obj)

# Return str Utf-8 of matched rules
def xpath_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
    from lxml import etree, html
    import elementpath
    # xpath 2.0-3.1
    from elementpath.xpath3 import XPath3Parser

    parser = etree.HTMLParser()
    if is_rss:
        # So that we can keep CDATA for cdata_in_document_to_text() to process
        parser = etree.XMLParser(strip_cdata=False)

    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    html_block = ""

    r = elementpath.select(tree, xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'}, parser=XPath3Parser)
    #@note: //title/text() wont work where <title>CDATA..

    if type(r) != list:
        r = [r]

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and (not hasattr( element, 'tag' ) or not element.tag in (['br', 'hr', 'div', 'p'])):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        if type(element) == str:
            html_block += element
        elif issubclass(type(element), etree._Element) or issubclass(type(element), etree._ElementTree):
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')
        else:
            html_block += elementpath_tostring(element)

    return html_block

# Return str Utf-8 of matched rules
# 'xpath1:'
def xpath1_filter(xpath_filter, html_content, append_pretty_line_formatting=False, is_rss=False):
    from lxml import etree, html

    parser = None
    if is_rss:
        # So that we can keep CDATA for cdata_in_document_to_text() to process
        parser = etree.XMLParser(strip_cdata=False)

    tree = html.fromstring(bytes(html_content, encoding='utf-8'), parser=parser)
    html_block = ""

    r = tree.xpath(xpath_filter.strip(), namespaces={'re': 'http://exslt.org/regular-expressions'})
    #@note: //title/text() wont work where <title>CDATA..

    for element in r:
        # When there's more than 1 match, then add the suffix to separate each line
        # And where the matched result doesn't include something that will cause Inscriptis to add a newline
        # (This way each 'match' reliably has a new-line in the diff)
        # Divs are converted to 4 whitespaces by inscriptis
        if append_pretty_line_formatting and len(html_block) and (not hasattr(element, 'tag') or not element.tag in (['br', 'hr', 'div', 'p'])):
            html_block += TEXT_FILTER_LIST_LINE_SUFFIX

        # Some kind of text, UTF-8 or other
        if isinstance(element, (str, bytes)):
            html_block += element
        else:
            # Return the HTML which will get parsed as text
            html_block += etree.tostring(element, pretty_print=True).decode('utf-8')

    return html_block

# Extract/find element
def extract_element(find='title', html_content=''):
    from bs4 import BeautifulSoup

    #Re #106, be sure to handle when its not found
    element_text = None

    soup = BeautifulSoup(html_content, 'html.parser')
    result = soup.find(find)
    if result and result.string:
        element_text = result.string.strip()

    return element_text

#
def _parse_json(json_data, json_filter):
    from jsonpath_ng.ext import parse

    if json_filter.startswith("json:"):
        jsonpath_expression = parse(json_filter.replace('json:', ''))
        match = jsonpath_expression.find(json_data)
        return _get_stripped_text_from_json_match(match)

    if json_filter.startswith("jq:") or json_filter.startswith("jqraw:"):

        try:
            import jq
        except ModuleNotFoundError:
            # `jq` requires full compilation in windows and so isn't generally available
            raise Exception("jq not support not found")

        if json_filter.startswith("jq:"):
            jq_expression = jq.compile(json_filter.removeprefix("jq:"))
            match = jq_expression.input(json_data).all()
            return _get_stripped_text_from_json_match(match)

        if json_filter.startswith("jqraw:"):
            jq_expression = jq.compile(json_filter.removeprefix("jqraw:"))
            match = jq_expression.input(json_data).all()
            return '\n'.join(str(item) for item in match)

def _get_stripped_text_from_json_match(match):
    s = []
    # More than one result, we will return it as a JSON list.
    if len(match) > 1:
        for i in match:
            s.append(i.value if hasattr(i, 'value') else i)

    # Single value, use just the value, as it could be later used in a token in notifications.
    if len(match) == 1:
        s = match[0].value if hasattr(match[0], 'value') else match[0]

    # Re #257 - Better handling where it does not exist, in the case the original 's' value was False..
    if not match:
        # Re 265 - Just return an empty string when filter not found
        return ''

    # Ticket #462 - allow the original encoding through, usually it's UTF-8 or similar
    stripped_text_from_html = json.dumps(s, indent=4, ensure_ascii=False)

    return stripped_text_from_html

# content - json
# json_filter - ie json:$..price
# ensure_is_ldjson_info_type - str "product", optional, "@type == product" (I dont know how to do that as a json selector)
def extract_json_as_string(content, json_filter, ensure_is_ldjson_info_type=None):
    from bs4 import BeautifulSoup

    stripped_text_from_html = False
# https://github.com/dgtlmoon/changedetection.io/pull/2041#issuecomment-1848397161w
    # Try to parse/filter out the JSON, if we get some parser error, then maybe it's embedded within HTML tags
    try:
        # .lstrip("\ufeff") strings ByteOrderMark from UTF8 and still lets the UTF work
        stripped_text_from_html = _parse_json(json.loads(content.lstrip("\ufeff") ), json_filter)
    except json.JSONDecodeError as e:
        logger.warning(str(e))

        # Foreach <script json></script> blob.. just return the first that matches json_filter
        # As a last resort, try to parse the whole <body>
        soup = BeautifulSoup(content, 'html.parser')

        if ensure_is_ldjson_info_type:
            bs_result = soup.findAll('script', {"type": "application/ld+json"})
        else:
            bs_result = soup.findAll('script')
        bs_result += soup.findAll('body')

        bs_jsons = []
        for result in bs_result:
            # Skip empty tags, and things that dont even look like JSON
            if not result.text or '{' not in result.text:
                continue
            try:
                json_data = json.loads(result.text)
                bs_jsons.append(json_data)
            except json.JSONDecodeError:
                # Skip objects which cannot be parsed
                continue

        if not bs_jsons:
            raise JSONNotFound("No parsable JSON found in this document")

        for json_data in bs_jsons:
            stripped_text_from_html = _parse_json(json_data, json_filter)

            if ensure_is_ldjson_info_type:
                # Could sometimes be list, string or something else random
                if isinstance(json_data, dict):
                    # If it has LD JSON 'key' @type, and @type is 'product', and something was found for the search
                    # (Some sites have multiple of the same ld+json @type='product', but some have the review part, some have the 'price' part)
                    # @type could also be a list although non-standard ("@type": ["Product", "SubType"],)
                    # LD_JSON auto-extract also requires some content PLUS the ldjson to be present
                    # 1833 - could be either str or dict, should not be anything else

                    t = json_data.get('@type')
                    if t and stripped_text_from_html:

                        if isinstance(t, str) and t.lower() == ensure_is_ldjson_info_type.lower():
                            break
                        # The non-standard part, some have a list
                        elif isinstance(t, list):
                            if ensure_is_ldjson_info_type.lower() in [x.lower().strip() for x in t]:
                                break

            elif stripped_text_from_html:
                break

    if not stripped_text_from_html:
        # Re 265 - Just return an empty string when filter not found
        return ''

    return stripped_text_from_html

# Mode     - "content" return the content without the matches (default)
#          - "line numbers" return a list of line numbers that match (int list)
#
# wordlist - list of regex's (str) or words (str)
# Preserves all linefeeds and other whitespacing, its not the job of this to remove that
def strip_ignore_text(content, wordlist, mode="content"):
    ignore_text = []
    ignore_regex = []
    ignore_regex_multiline = []
    ignored_lines = []

    for k in wordlist:
        # Is it a regex?
        res = re.search(PERL_STYLE_REGEX, k, re.IGNORECASE)
        if res:
            res = re.compile(perl_style_slash_enclosed_regex_to_options(k))
            if res.flags & re.DOTALL or res.flags & re.MULTILINE:
                ignore_regex_multiline.append(res)
            else:
                ignore_regex.append(res)
        else:
            ignore_text.append(k.strip())

    for r in ignore_regex_multiline:
        for match in r.finditer(content):
            content_lines = content[:match.end()].splitlines(keepends=True)
            match_lines = content[match.start():match.end()].splitlines(keepends=True)

            end_line = len(content_lines)
            start_line = end_line - len(match_lines)

            if end_line - start_line <= 1:
                # Match is empty or in the middle of the line
                ignored_lines.append(start_line)
            else:
                for i in range(start_line, end_line):
                    ignored_lines.append(i)

    line_index = 0
    lines = content.splitlines(keepends=True)
    for line in lines:
        # Always ignore blank lines in this mode. (when this function gets called)
        got_match = False
        for l in ignore_text:
            if l.lower() in line.lower():
                got_match = True

        if not got_match:
            for r in ignore_regex:
                if r.search(line):
                    got_match = True

        if got_match:
            ignored_lines.append(line_index)

        line_index += 1

    ignored_lines = set([i for i in ignored_lines if i >= 0 and i < len(lines)])

    # Used for finding out what to highlight
    if mode == "line numbers":
        return [i + 1 for i in ignored_lines]

    output_lines = set(range(len(lines))) - ignored_lines
    return ''.join([lines[i] for i in output_lines])

def cdata_in_document_to_text(html_content: str, render_anchor_tag_content=False) -> str:
    from xml.sax.saxutils import escape as xml_escape
    pattern = '<!\[CDATA\[(\s*(?:.(?<!\]\]>)\s*)*)\]\]>'
    def repl(m):
        text = m.group(1)
        return xml_escape(html_to_text(html_content=text)).strip()

    return re.sub(pattern, repl, html_content)


# NOTE!! ANYTHING LIBXML, HTML5LIB ETC WILL CAUSE SOME SMALL MEMORY LEAK IN THE LOCAL "LIB" IMPLEMENTATION OUTSIDE PYTHON
import os

def html_to_text_sub_worker(temp_file_path, html_content, render_anchor_tag_content=False, is_rss=False):
    from inscriptis import get_text
    from inscriptis.model.config import ParserConfig
    try:
        if render_anchor_tag_content:
            parser_config = ParserConfig(
                annotation_rules={"a": ["hyperlink"]},
                display_links=True
            )
        else:
            parser_config = None

        if is_rss:
            html_content = re.sub(r'<title([\s>])', r'<h1\1', html_content)
            html_content = re.sub(r'</title>', r'</h1>', html_content)

        text_content = get_text(html_content, config=parser_config)

        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(text_content)

    except Exception as e:
        # Write error to file so the parent can read it
        with open(temp_file_path, "w", encoding="utf-8") as f:
            f.write(f"[ERROR] {e}")

import tempfile
from multiprocessing import Process
def html_to_text(html_content: str, render_anchor_tag_content=False, is_rss=False, timeout=10) -> str:


    with tempfile.NamedTemporaryFile(delete=False, mode="w+", encoding="utf-8") as tmp_file:
        temp_file_path = tmp_file.name

    p = Process(
        target=html_to_text_sub_worker,
        args=(temp_file_path, html_content, render_anchor_tag_content, is_rss)
    )
    p.start()
    p.join(timeout)

    if p.is_alive():
        p.terminate()
        p.join()

    try:
        with open(temp_file_path, "r", encoding="utf-8") as f:
            result = f.read()
    finally:
        os.remove(temp_file_path)

    return result

# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
def has_ldjson_product_info(content):
    try:
        # Better than .lower() which can use a lot of ram
        if (re.search(r'application/ld\+json', content, re.IGNORECASE) and
            re.search(r'"price"', content, re.IGNORECASE) and
            re.search(r'"pricecurrency"', content, re.IGNORECASE)):
            return True

#       On some pages this is really terribly expensive when they dont really need it
#       (For example you never want price monitoring, but this runs on every watch to suggest it)
#        for filter in LD_JSON_PRODUCT_OFFER_SELECTORS:
#            pricing_data += extract_json_as_string(content=content,
#                                                  json_filter=filter,
#                                                  ensure_is_ldjson_info_type="product")
    except Exception as e:
        # OK too
        return False

    return False


def workarounds_for_obfuscations(content):
    """
    Some sites are using sneaky tactics to make prices and other information un-renderable by Inscriptis
    This could go into its own Pip package in the future, for faster updates
    """

    # HomeDepot.com style <span>$<!-- -->90<!-- -->.<!-- -->74</span>
    # https://github.com/weblyzard/inscriptis/issues/45
    if not content:
        return content

    content = re.sub('<!--\s+-->', '', content)

    return content


def get_triggered_text(content, trigger_text):
    triggered_text = []
    result = strip_ignore_text(content=content,
                               wordlist=trigger_text,
                               mode="line numbers")

    i = 1
    for p in content.splitlines():
        if i in result:
            triggered_text.append(p)
        i += 1

    return triggered_text