Fix tag recognition when a tag is at the start of an HTML paragraph

2020-04-13 21:28:22 +03:00 · 2020-04-13 21:28:22 +03:00 · 9469101549
commit 9469101549
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@ -83,6 +83,14 @@ class TestFindTags:
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == source

+    def test_start_of_paragraph_in_html_content(self):
+        source = '<p>First line</p><p>#foobar #barfoo</p>'
+        tags, text = find_tags(source)
+        assert tags == {"foobar", "barfoo"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == '<p>First line</p><p>#foobar/foobar #barfoo/barfoo</p>'
+

 class TestProcessTextLinks:
    def test_link_at_start_or_end(self):
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@ -33,7 +33,9 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
    Returns a set of tags and the original or replaced text.
    """
    found_tags = set()
-    lines = text.splitlines(keepends=True)
+    # <br>, <p> and </p> tags directly adjacent to words prevent us from finding those words - pad them with spaces
+    new_text = text.replace("<br>", " <br> ").replace("<p>", " <p> ").replace("</p>", " </p> ")
+    lines = new_text.splitlines(keepends=True)
    final_lines = []
    code_block = False
    final_text = None
@ -78,6 +80,8 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
        final_lines.append(" ".join(final_words))
    if replacer:
        final_text = "".join(final_lines)
+    if final_text:
+        final_text = final_text.replace(" <br> ", "<br>").replace(" <p> ", "<p>").replace(" </p> ", "</p>")
    return found_tags, final_text or text