Refactor rich text rewriter to introduce a TagMatch object

pull/11961/head
Matt Westcott 2024-05-17 20:11:02 +01:00
rodzic 83af49327b
commit 7d2f485e97
1 zmienionych plików z 54 dodań i 28 usunięć

Wyświetl plik

@ -4,7 +4,9 @@ Utility classes for rewriting elements of HTML-like strings
import re
from collections import defaultdict
from typing import Callable, Tuple
from typing import Callable, Dict, List
from django.utils.functional import cached_property
FIND_A_TAG = re.compile(r"<a(\b[^>]*)>")
FIND_EMBED_TAG = re.compile(r"<embed(\b[^>]*)/>")
@ -27,6 +29,26 @@ def extract_attrs(attr_string: str) -> dict:
return attributes
class TagMatch:
"""Represents a single matched tag in a rich text string"""
def __init__(self, match):
self.match = match # a regexp match object
self.replacement = None # to be filled in by the rewriter
@cached_property
def attrs(self):
return extract_attrs(self.match.group(1))
@property
def start(self):
return self.match.start()
@property
def end(self):
return self.match.end()
class TagRewriter:
def __init__(self, rules=None, bulk_rules=None, reference_extractors=None):
self.rules = rules or {}
@ -37,61 +59,65 @@ class TagRewriter:
raise NotImplementedError
def get_tag_type_from_attrs(self, attrs):
"""Given a dict of attributes from a tag, return the tag type."""
raise NotImplementedError
def get_tag_replacements(self, tag_type, attrs_list):
# Note: return an empty list for cases when you don't want any replacements made
"""Given a list of attribute dicts, all taken from tags of the same type, return a
corresponding list of replacement strings to replace the tags with.
Return an empty list for cases when you don't want any replacements made.
"""
raise NotImplementedError
def __call__(self, html: str) -> str:
matches_by_tag_type, attrs_by_tag_type = self.extract_tags(html)
replacements = [
self.get_tag_replacements(tag_type, attrs_list)
for tag_type, attrs_list in attrs_by_tag_type.items()
]
matches_by_tag_type = self.extract_tags(html)
matches_to_replace = []
for matches, replacements in zip(matches_by_tag_type.values(), replacements):
# For each tag type, get the list of replacement strings for all tags of that type
for tag_type, tag_matches in matches_by_tag_type.items():
attr_dicts = [match.attrs for match in tag_matches]
replacements = self.get_tag_replacements(tag_type, attr_dicts)
if not replacements:
continue
matches_to_replace.extend(zip(matches, replacements))
for match, replacement in zip(tag_matches, replacements):
match.replacement = replacement
matches_to_replace.append(match)
# Replace the tags in order of appearance in the string, so that offsets remain valid
matches_to_replace.sort(key=lambda match: match.start)
offset = 0
for match, replacement in sorted(
matches_to_replace, key=lambda match_to_replace: match_to_replace[0].start()
):
for match in matches_to_replace:
html = (
html[: match.start() + offset]
+ replacement
+ html[match.end() + offset :]
html[: match.start + offset]
+ match.replacement
+ html[match.end + offset :]
)
offset += len(replacement) - match.end() + match.start()
offset += len(match.replacement) - match.end + match.start
return html
def extract_tags(self, html: str) -> Tuple[dict, dict]:
def extract_tags(self, html: str) -> Dict[str, List[TagMatch]]:
"""Helper method to extract and group HTML tags and their attributes.
Returns the full list of regex matches grouped by tag type as well as
the tag attribute dictionaries grouped by tag type.
Returns a dict of TagMatch objects, mapping tag types to a list of all TagMatch objects of that tag type.
"""
matches_by_tag_type = defaultdict(list)
attrs_by_tag_type = defaultdict(list)
# Regex used to match <tag ...> tags in the HTML.
re_pattern = self.get_opening_tag_regex()
for match in re_pattern.finditer(html):
attrs = extract_attrs(match.group(1))
tag_type = self.get_tag_type_from_attrs(attrs)
for re_match in re_pattern.finditer(html):
tag_match = TagMatch(re_match)
tag_type = self.get_tag_type_from_attrs(tag_match.attrs)
matches_by_tag_type[tag_type].append(match)
attrs_by_tag_type[tag_type].append(attrs)
matches_by_tag_type[tag_type].append(tag_match)
return matches_by_tag_type, attrs_by_tag_type
return matches_by_tag_type
def convert_rule_to_bulk_rule(self, rule: Callable) -> Callable:
def bulk_rule(args):