Refactor rich text rewriter to introduce a TagMatch object

2024-05-17 20:11:02 +01:00 · 2024-05-17 20:11:02 +01:00 · 7d2f485e97
commit 7d2f485e97
--- a/wagtail/rich_text/rewriters.py
+++ b/wagtail/rich_text/rewriters.py
@ -4,7 +4,9 @@ Utility classes for rewriting elements of HTML-like strings

 import re
 from collections import defaultdict
-from typing import Callable, Tuple
+from typing import Callable, Dict, List
+
+from django.utils.functional import cached_property

 FIND_A_TAG = re.compile(r"<a(\b[^>]*)>")
 FIND_EMBED_TAG = re.compile(r"<embed(\b[^>]*)/>")
@ -27,6 +29,26 @@ def extract_attrs(attr_string: str) -> dict:
    return attributes


+class TagMatch:
+    """Represents a single matched tag in a rich text string"""
+
+    def __init__(self, match):
+        self.match = match  # a regexp match object
+        self.replacement = None  # to be filled in by the rewriter
+
+    @cached_property
+    def attrs(self):
+        return extract_attrs(self.match.group(1))
+
+    @property
+    def start(self):
+        return self.match.start()
+
+    @property
+    def end(self):
+        return self.match.end()
+
+
 class TagRewriter:
    def __init__(self, rules=None, bulk_rules=None, reference_extractors=None):
        self.rules = rules or {}
@ -37,61 +59,65 @@ class TagRewriter:
        raise NotImplementedError

    def get_tag_type_from_attrs(self, attrs):
+        """Given a dict of attributes from a tag, return the tag type."""
        raise NotImplementedError

    def get_tag_replacements(self, tag_type, attrs_list):
-        # Note: return an empty list for cases when you don't want any replacements made
+        """Given a list of attribute dicts, all taken from tags of the same type, return a
+        corresponding list of replacement strings, one per tag and in the same order.
+
+        Return an empty list for cases when you don't want any replacements made.
+        """
        raise NotImplementedError

    def __call__(self, html: str) -> str:
-        matches_by_tag_type, attrs_by_tag_type = self.extract_tags(html)
-
-        replacements = [
-            self.get_tag_replacements(tag_type, attrs_list)
-            for tag_type, attrs_list in attrs_by_tag_type.items()
-        ]
-
+        matches_by_tag_type = self.extract_tags(html)
        matches_to_replace = []
-        for matches, replacements in zip(matches_by_tag_type.values(), replacements):
+
+        # For each tag type, get the list of replacement strings for all tags of that type
+        for tag_type, tag_matches in matches_by_tag_type.items():
+            attr_dicts = [match.attrs for match in tag_matches]
+            replacements = self.get_tag_replacements(tag_type, attr_dicts)
+
            if not replacements:
                continue

-            matches_to_replace.extend(zip(matches, replacements))
+            for match, replacement in zip(tag_matches, replacements):
+                match.replacement = replacement
+                matches_to_replace.append(match)
+
+        # Replace the tags in order of appearance in the string, so that offsets remain valid
+        matches_to_replace.sort(key=lambda match: match.start)

        offset = 0
-        for match, replacement in sorted(
-            matches_to_replace, key=lambda match_to_replace: match_to_replace[0].start()
-        ):
+        for match in matches_to_replace:
            html = (
-                html[: match.start() + offset]
-                + replacement
-                + html[match.end() + offset :]
+                html[: match.start + offset]
+                + match.replacement
+                + html[match.end + offset :]
            )

-            offset += len(replacement) - match.end() + match.start()
+            offset += len(match.replacement) - match.end + match.start

        return html

-    def extract_tags(self, html: str) -> Tuple[dict, dict]:
+    def extract_tags(self, html: str) -> Dict[str, List[TagMatch]]:
        """Helper method to extract and group HTML tags and their attributes.

-        Returns the full list of regex matches grouped by tag type as well as
-        the tag attribute dictionaries grouped by tag type.
+        Returns a dict mapping each tag type to a list of the TagMatch objects of that type, in order of appearance.
        """
        matches_by_tag_type = defaultdict(list)
-        attrs_by_tag_type = defaultdict(list)

        # Regex used to match <tag ...> tags in the HTML.
        re_pattern = self.get_opening_tag_regex()

-        for match in re_pattern.finditer(html):
-            attrs = extract_attrs(match.group(1))
-            tag_type = self.get_tag_type_from_attrs(attrs)
+        for re_match in re_pattern.finditer(html):
+            tag_match = TagMatch(re_match)
+            tag_type = self.get_tag_type_from_attrs(tag_match.attrs)

-            matches_by_tag_type[tag_type].append(match)
-            attrs_by_tag_type[tag_type].append(attrs)
+            matches_by_tag_type[tag_type].append(tag_match)

-        return matches_by_tag_type, attrs_by_tag_type
+        return matches_by_tag_type

    def convert_rule_to_bulk_rule(self, rule: Callable) -> Callable:
        def bulk_rule(args):