From 13f0cf0db667755dc85d24e840534abb51769b80 Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Sun, 12 Apr 2020 23:32:42 +0300
Subject: [PATCH 1/6] Stop markdownifying received ActivityPub content

It was causing more trouble than benefits. Just accept
HTML content into entities raw_content attribute and
let apps deal with it.
---
 CHANGELOG.md                                  |  4 ++++
 federation/entities/activitypub/entities.py   | 20 ++++++++++++-------
 federation/entities/activitypub/mappers.py    | 17 ++++++----------
 .../entities/activitypub/test_entities.py     |  4 ----
 .../entities/activitypub/test_mappers.py      | 14 +++++++++----
 federation/tests/fixtures/entities.py         | 16 +--------------
 setup.py                                      |  1 -
 7 files changed, 34 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a4c698f..e470a3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,6 +32,10 @@
   
 * Don't include OStatus for Mastodon 3.0+ protocols list. ([related issue](https://github.com/thefederationinfo/the-federation.info/issues/217))
 
+* **Backwards incompatible**: Stop markdownifying incoming ActivityPub content. Instead
+  copy it as is to the ``raw_content`` attribute on the entity, setting also the
+  ``_media_type`` to ``text/html``.
+
 ### Fixed
 
 * Don't crash loudly when fetching webfinger for Diaspora that does not contain XML.
diff --git a/federation/entities/activitypub/entities.py b/federation/entities/activitypub/entities.py
index d8163cb..0b88fee 100644
--- a/federation/entities/activitypub/entities.py
+++ b/federation/entities/activitypub/entities.py
@@ -3,6 +3,8 @@ import re
 import uuid
 from typing import Dict, List
 
+import bleach
+
 from federation.entities.activitypub.constants import (
     CONTEXTS_DEFAULT, CONTEXT_MANUALLY_APPROVES_FOLLOWERS, CONTEXT_SENSITIVE, CONTEXT_HASHTAG,
     CONTEXT_LD_SIGNATURES)
@@ -57,15 +59,19 @@ class CleanContentMixin(RawContentMixin):
         """
         Make linkified tags normal tags.
         """
-        def cleaner(match):
-            return f"#{match.groups()[0]}"
-
         super().post_receive()
-        self.raw_content = re.sub(
-            r'\[#([\w\-_]+)\]\(http?s://[a-zA-Z0-9/._-]+\)',
-            cleaner,
+
+        def remove_tag_links(attrs, new=False):
+            rel = (None, "rel")
+            if attrs.get(rel) == "tag":
+                return
+            return attrs
+
+        self.raw_content = bleach.linkify(
             self.raw_content,
-            re.MULTILINE,
+            callbacks=[remove_tag_links],
+            parse_email=False,
+            skip_tags=["code", "pre"],
         )
 
 
diff --git a/federation/entities/activitypub/mappers.py b/federation/entities/activitypub/mappers.py
index 4254f96..1506a48 100644
--- a/federation/entities/activitypub/mappers.py
+++ b/federation/entities/activitypub/mappers.py
@@ -1,8 +1,6 @@
 import logging
 from typing import List, Callable, Dict, Union, Optional
 
-from markdownify import markdownify
-
 from federation.entities.activitypub.constants import NAMESPACE_PUBLIC
 from federation.entities.activitypub.entities import (
     ActivitypubFollow, ActivitypubProfile, ActivitypubAccept, ActivitypubPost, ActivitypubComment,
@@ -259,19 +257,16 @@ def transform_attribute(
     elif key == "attributedTo" and is_object:
         transformed["actor_id"] = value
     elif key in ("content", "source"):
-        if payload.get('source') and isinstance(payload.get("source"), dict):
+        if payload.get('source') and isinstance(payload.get("source"), dict) and \
+                payload.get('source').get('mediaType') == "text/markdown":
+            transformed["_media_type"] = "text/markdown"
+            transformed["raw_content"] = payload.get('source').get('content').strip()
             transformed["_rendered_content"] = payload.get('content').strip()
-            if payload.get('source').get('mediaType') == "text/markdown":
-                transformed["_media_type"] = "text/markdown"
-                transformed["raw_content"] = payload.get('source').get('content').strip()
-            else:
-                transformed["raw_content"] = markdownify(payload.get('content').strip())
-                transformed["_media_type"] = payload.get('source').get('mediaType')
         else:
-            transformed["raw_content"] = markdownify(payload.get('content').strip()).strip()
             # Assume HTML by convention
-            transformed["_rendered_content"] = payload.get('content').strip()
             transformed["_media_type"] = "text/html"
+            transformed["raw_content"] = payload.get('content').strip()
+            transformed["_rendered_content"] = transformed["raw_content"]
     elif key == "endpoints" and isinstance(value, dict):
         if "inboxes" not in transformed:
             transformed["inboxes"] = {"private": None, "public": None}
diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py
index 87ec829..7ad874b 100644
--- a/federation/tests/entities/activitypub/test_entities.py
+++ b/federation/tests/entities/activitypub/test_entities.py
@@ -409,10 +409,6 @@ class TestEntitiesPostReceive:
             "public": False,
         }]
 
-    def test_post__post_receive__cleans_linkified_tags(self, activitypubpost_linkified_tags):
-        activitypubpost_linkified_tags.post_receive()
-        assert activitypubpost_linkified_tags.raw_content == '<p>👁️foobar</p><p>barfoo!<br>#fanart #mastoart</p>'
-
 
 class TestEntitiesPreSend:
     def test_post_inline_images_are_attached(self, activitypubpost_embedded_images):
diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py
index c50bfae..f8c5628 100644
--- a/federation/tests/entities/activitypub/test_mappers.py
+++ b/federation/tests/entities/activitypub/test_mappers.py
@@ -67,7 +67,9 @@ class TestActivitypubEntityMappersReceive:
         post = entities[0]
         assert isinstance(post, ActivitypubPost)
         assert isinstance(post, Post)
-        assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom'
+        assert post.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
+                                   'href="https://dev.jasonrobinson.me/u/jaywink/">' \
+                                   '@<span>jaywink</span></a></span> boom</p>'
         assert post.rendered_content == '<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" ' \
                                         'class="u-url mention">@<span>jaywink</span></a></span> boom</p>'
         assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
@@ -82,7 +84,7 @@ class TestActivitypubEntityMappersReceive:
         post = entities[0]
         assert isinstance(post, ActivitypubPost)
         assert isinstance(post, Post)
-        assert post.raw_content == 'boom #test'
+        assert post.raw_content == '<p>boom #test</p>'
 
     def test_message_to_objects_simple_post__with_mentions(self):
         entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink")
@@ -101,7 +103,9 @@ class TestActivitypubEntityMappersReceive:
         assert isinstance(post, Post)
         assert post.rendered_content == '<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" ' \
                                         'class="u-url mention">@<span>jaywink</span></a></span> boom</p>'
-        assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom\n\n'
+        assert post.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
+                                   'href="https://dev.jasonrobinson.me/u/jaywink/">' \
+                                   '@<span>jaywink</span></a></span> boom</p>'
 
     def test_message_to_objects_simple_post__with_source__markdown(self):
         entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink")
@@ -141,7 +145,9 @@ class TestActivitypubEntityMappersReceive:
         comment = entities[0]
         assert isinstance(comment, ActivitypubComment)
         assert isinstance(comment, Comment)
-        assert comment.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom'
+        assert comment.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
+                                      'href="https://dev.jasonrobinson.me/u/jaywink/">' \
+                                      '@<span>jaywink</span></a></span> boom</p>'
         assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
         assert comment.actor_id == "https://diaspodon.fr/users/jaywink"
         assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/"
diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py
index 317f66d..268f059 100644
--- a/federation/tests/fixtures/entities.py
+++ b/federation/tests/fixtures/entities.py
@@ -4,7 +4,7 @@ from freezegun import freeze_time
 from federation.entities.activitypub.entities import (
     ActivitypubPost, ActivitypubAccept, ActivitypubFollow, ActivitypubProfile, ActivitypubComment,
     ActivitypubRetraction, ActivitypubShare, ActivitypubImage)
-from federation.entities.base import Profile, Image
+from federation.entities.base import Profile
 from federation.entities.diaspora.entities import (
     DiasporaPost, DiasporaComment, DiasporaLike, DiasporaProfile, DiasporaRetraction,
     DiasporaContact, DiasporaReshare,
@@ -144,20 +144,6 @@ https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a902
         )
 
 
-@pytest.fixture
-def activitypubpost_linkified_tags():
-    with freeze_time("2019-04-27"):
-        return ActivitypubPost(
-            raw_content='<p>👁️foobar</p><p>barfoo!<br>[#fanart](https://mastodon.art/tags/fanart) '
-                        '[#mastoart](https://mastodon.art/tags/mastoart)</p>',
-            public=True,
-            provider_display_name="Mastodon",
-            id=f"http://127.0.0.1:8000/post/123456/",
-            activity_id=f"http://127.0.0.1:8000/post/123456/#create",
-            actor_id=f"http://127.0.0.1:8000/profile/123456/",
-        )
-
-
 @pytest.fixture
 def activitypubprofile():
     return ActivitypubProfile(
diff --git a/setup.py b/setup.py
index c59182d..0757dee 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,6 @@ setup(
         "lxml>=3.4.0",
         "ipdata>=3.0",
         "iteration_utilities",
-        "markdownify",
         "jsonschema>=2.0.0",
         "pycryptodome>=3.4.10",
         "python-dateutil>=2.4.0",

From f704175a218a1c3eceb1c8f6e3c6b4083e78227d Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Mon, 13 Apr 2020 11:35:54 +0300
Subject: [PATCH 2/6] Ensure tags are found also if wrapped within HTML blocks

---
 federation/tests/utils/test_text.py |  8 ++++++++
 federation/utils/text.py            | 28 +++++++++++++++++-----------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 6f0c1b1..8f406c1 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -49,6 +49,14 @@ class TestFindTags:
         tags, text = find_tags(source, replacer=self._replacer)
         assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
 
+    def test_ok_with_html_tags_in_text(self):
+        source = "<p>#starting and <span>#MixED</span> however not <#>this</#> or <#/>that"
+        tags, text = find_tags(source)
+        assert tags == {"starting", "mixed"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "<p>#starting/starting and <span>#MixED/mixed</span> however not <#>this</#> or <#/>that"
+
     def test_postfixed_tags(self):
         source = "#foo) #bar] #hoo, #hee."
         tags, text = find_tags(source)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 8bfb02b..93bf311 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -49,17 +49,23 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
         # Check each word separately
         words = line.split(" ")
         for word in words:
-            candidate = word.strip().strip("([]),.!?:")
-            if candidate.startswith("#"):
-                candidate = candidate.strip("#")
-                if test_tag(candidate.lower()):
-                    found_tags.add(candidate.lower())
-                    if replacer:
-                        try:
-                            tag_word = word.replace("#%s" % candidate, replacer(candidate))
-                            final_words.append(tag_word)
-                        except Exception:
-                            final_words.append(word)
+            if word.find('#') > -1:
+                candidate = word.strip().strip("([]),.!?:")
+                if candidate.find('<') > -1 or candidate.find('>') > -1:
+                    # Strip html
+                    candidate = bleach.clean(word, strip=True)
+                if candidate.startswith("#"):
+                    candidate = candidate.strip("#")
+                    if test_tag(candidate.lower()):
+                        found_tags.add(candidate.lower())
+                        if replacer:
+                            try:
+                                tag_word = word.replace("#%s" % candidate, replacer(candidate))
+                                final_words.append(tag_word)
+                            except Exception:
+                                final_words.append(word)
+                    else:
+                        final_words.append(word)
                 else:
                     final_words.append(word)
             else:

From a2653239d66ee6a97f06192aeba9f66baa4386f3 Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Mon, 13 Apr 2020 11:46:10 +0300
Subject: [PATCH 3/6] Fix some characters stopping tags being identified

Closes socialhome/socialhome#222
---
 CHANGELOG.md                        | 2 ++
 federation/tests/utils/test_text.py | 6 +++---
 federation/utils/text.py            | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e470a3a..fcd8726 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -63,6 +63,8 @@
 
 * Don't try to relay AP payloads to Diaspora receivers and vice versa, for now, until cross-protocol
   relaying is supported.
+  
+* Fix some characters stopping tags being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222))
 
 ## [0.19.0] - 2019-12-15
 
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 8f406c1..fc39abe 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -34,12 +34,12 @@ class TestFindTags:
         assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n    #alsocode\n"
 
     def test_endings_are_filtered_out(self):
-        source = "#parenthesis) #exp! #list]"
+        source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah%"
         tags, text = find_tags(source)
-        assert tags == {"parenthesis", "exp", "list"}
+        assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah"}
         assert text == source
         tags, text = find_tags(source, replacer=self._replacer)
-        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
+        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah%"
 
     def test_finds_tags(self):
         source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 93bf311..7cbe206 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -50,7 +50,7 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
         words = line.split(" ")
         for word in words:
             if word.find('#') > -1:
-                candidate = word.strip().strip("([]),.!?:")
+                candidate = word.strip().strip("([]),.!?:*_%")
                 if candidate.find('<') > -1 or candidate.find('>') > -1:
                     # Strip html
                     candidate = bleach.clean(word, strip=True)

From 7c1f1670b1a3bf4f7856d9bd9ad647457280430a Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Mon, 13 Apr 2020 12:40:35 +0300
Subject: [PATCH 4/6] Fix tags separated by slashes not being identified

Fixes socialhome/socialhome#198
---
 CHANGELOG.md                        |  2 ++
 federation/tests/utils/test_text.py |  9 +++++----
 federation/utils/text.py            | 31 +++++++++++++++++------------
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fcd8726..842ce30 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -66,6 +66,8 @@
   
 * Fix some characters stopping tags being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222))
 
+* Fix tags separated by slashes being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/198))
+
 ## [0.19.0] - 2019-12-15
 
 ### Added
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index fc39abe..9073807 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -34,12 +34,13 @@ class TestFindTags:
         assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n    #alsocode\n"
 
     def test_endings_are_filtered_out(self):
-        source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah%"
+        source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar"
         tags, text = find_tags(source)
-        assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah"}
+        assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"}
         assert text == source
         tags, text = find_tags(source, replacer=self._replacer)
-        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah%"
+        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \
+                       "#foo/foo/#bar/bar"
 
     def test_finds_tags(self):
         source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
@@ -74,7 +75,7 @@ class TestFindTags:
         assert text == "(#foo/foo [#bar/bar"
 
     def test_invalid_text_returns_no_tags(self):
-        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
+        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a(a #a)a #a=a " \
                  "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
         tags, text = find_tags(source)
         assert tags == set()
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 7cbe206..9ad2cfe 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -50,22 +50,27 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
         words = line.split(" ")
         for word in words:
             if word.find('#') > -1:
-                candidate = word.strip().strip("([]),.!?:*_%")
+                candidate = word.strip().strip("([]),.!?:*_%/")
                 if candidate.find('<') > -1 or candidate.find('>') > -1:
                     # Strip html
                     candidate = bleach.clean(word, strip=True)
-                if candidate.startswith("#"):
-                    candidate = candidate.strip("#")
-                    if test_tag(candidate.lower()):
-                        found_tags.add(candidate.lower())
-                        if replacer:
-                            try:
-                                tag_word = word.replace("#%s" % candidate, replacer(candidate))
-                                final_words.append(tag_word)
-                            except Exception:
-                                final_words.append(word)
-                    else:
-                        final_words.append(word)
+                # Now split with slashes
+                candidates = candidate.split("/")
+                to_replace = []
+                for candidate in candidates:
+                    if candidate.startswith("#"):
+                        candidate = candidate.strip("#")
+                        if test_tag(candidate.lower()):
+                            found_tags.add(candidate.lower())
+                            to_replace.append(candidate)
+                if replacer:
+                    tag_word = word
+                    try:
+                        for counter, replacee in enumerate(to_replace, 1):
+                            tag_word = tag_word.replace("#%s" % replacee, replacer(replacee))
+                    except Exception:
+                        pass
+                    final_words.append(tag_word)
                 else:
                     final_words.append(word)
             else:

From dfaa692ea4dcda1edb2a12a24306487b4ec3e026 Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Mon, 13 Apr 2020 13:22:12 +0300
Subject: [PATCH 5/6] Add test ensuring mention classes are not stripped

By process_text_links text util.
---
 federation/tests/utils/test_text.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 9073807..2efa917 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -106,6 +106,12 @@ class TestProcessTextLinks:
         assert process_text_links('<a href="/streams/tag/foobar">#foobar</a>') == \
                '<a href="/streams/tag/foobar">#foobar</a>'
 
+    def test_does_not_remove_mention_classes(self):
+        assert process_text_links('<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" '
+                                  'class="u-url mention">@<span>jaywink</span></a></span> boom</p>') == \
+           '<p><span class="h-card"><a class="u-url mention" href="https://dev.jasonrobinson.me/u/jaywink/" ' \
+           'rel="nofollow" target="_blank">@<span>jaywink</span></a></span> boom</p>'
+
 
 def test_validate_handle():
     assert validate_handle("foo@bar.com")

From 9469101549e6218e63a9d43f6d3b42b25c5a0e95 Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Mon, 13 Apr 2020 21:28:22 +0300
Subject: [PATCH 6/6] Fix tag recognition if at start of HTML paragraph

---
 federation/tests/utils/test_text.py | 8 ++++++++
 federation/utils/text.py            | 6 +++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 2efa917..fa69b6b 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -83,6 +83,14 @@ class TestFindTags:
         tags, text = find_tags(source, replacer=self._replacer)
         assert text == source
 
+    def test_start_of_paragraph_in_html_content(self):
+        source = '<p>First line</p><p>#foobar #barfoo</p>'
+        tags, text = find_tags(source)
+        assert tags == {"foobar", "barfoo"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == '<p>First line</p><p>#foobar/foobar #barfoo/barfoo</p>'
+
 
 class TestProcessTextLinks:
     def test_link_at_start_or_end(self):
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 9ad2cfe..cebed5a 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -33,7 +33,9 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
     Returns a set of tags and the original or replaced text.
     """
     found_tags = set()
-    lines = text.splitlines(keepends=True)
+    # <br> and <p> tags cause issues in us finding words - add some spacing around them
+    new_text = text.replace("<br>", " <br> ").replace("<p>", " <p> ").replace("</p>", " </p> ")
+    lines = new_text.splitlines(keepends=True)
     final_lines = []
     code_block = False
     final_text = None
@@ -78,6 +80,8 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
         final_lines.append(" ".join(final_words))
     if replacer:
         final_text = "".join(final_lines)
+    if final_text:
+        final_text = final_text.replace(" <br> ", "<br>").replace(" <p> ", "<p>").replace(" </p> ", "</p>")
     return found_tags, final_text or text