Merge branch 'inbound-ap-html' into 'master'

Stop markdownifying received ActivityPub content. Closes socialhome/socialhome#198 and socialhome/socialhome#222. See merge request jaywink/federation!160.
2020-04-13 18:33:18 +00:00 · 2020-04-13 18:33:18 +00:00 · b4cc7071f4
commit b4cc7071f4
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -32,6 +32,10 @@
  
 * Don't include OStatus for Mastodon 3.0+ protocols list. ([related issue](https://github.com/thefederationinfo/the-federation.info/issues/217))

+* **Backwards incompatible**: Stop markdownifying incoming ActivityPub content. Instead,
+  copy it as-is to the ``raw_content`` attribute on the entity, and also set the
+  ``_media_type`` to ``text/html``.
+
 ### Fixed

 * Don't crash loudly when fetching webfinger for Diaspora that does not contain XML.
@ -59,6 +63,10 @@

 * Don't try to relay AP payloads to Diaspora receivers and vice versa, for now, until cross-protocol
  relaying is supported.
+  
+* Fix some characters preventing tags from being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222))
+
+* Fix tags separated by slashes not being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/198))

 ## [0.19.0] - 2019-12-15

--- a/federation/entities/activitypub/entities.py
+++ b/federation/entities/activitypub/entities.py
@ -3,6 +3,8 @@ import re
 import uuid
 from typing import Dict, List

+import bleach
+
 from federation.entities.activitypub.constants import (
    CONTEXTS_DEFAULT, CONTEXT_MANUALLY_APPROVES_FOLLOWERS, CONTEXT_SENSITIVE, CONTEXT_HASHTAG,
    CONTEXT_LD_SIGNATURES)
@ -57,15 +59,19 @@ class CleanContentMixin(RawContentMixin):
        """
        Make linkified tags normal tags.
        """
-        def cleaner(match):
-            return f"#{match.groups()[0]}"
-
        super().post_receive()
-        self.raw_content = re.sub(
-            r'\[#([\w\-_]+)\]\(http?s://[a-zA-Z0-9/._-]+\)',
-            cleaner,
+
+        def remove_tag_links(attrs, new=False):
+            rel = (None, "rel")
+            if attrs.get(rel) == "tag":
+                return
+            return attrs
+
+        self.raw_content = bleach.linkify(
            self.raw_content,
-            re.MULTILINE,
+            callbacks=[remove_tag_links],
+            parse_email=False,
+            skip_tags=["code", "pre"],
        )


--- a/federation/entities/activitypub/mappers.py
+++ b/federation/entities/activitypub/mappers.py
@ -1,8 +1,6 @@
 import logging
 from typing import List, Callable, Dict, Union, Optional

-from markdownify import markdownify
-
 from federation.entities.activitypub.constants import NAMESPACE_PUBLIC
 from federation.entities.activitypub.entities import (
    ActivitypubFollow, ActivitypubProfile, ActivitypubAccept, ActivitypubPost, ActivitypubComment,
@ -259,19 +257,16 @@ def transform_attribute(
    elif key == "attributedTo" and is_object:
        transformed["actor_id"] = value
    elif key in ("content", "source"):
-        if payload.get('source') and isinstance(payload.get("source"), dict):
+        if payload.get('source') and isinstance(payload.get("source"), dict) and \
+                payload.get('source').get('mediaType') == "text/markdown":
+            transformed["_media_type"] = "text/markdown"
+            transformed["raw_content"] = payload.get('source').get('content').strip()
            transformed["_rendered_content"] = payload.get('content').strip()
-            if payload.get('source').get('mediaType') == "text/markdown":
-                transformed["_media_type"] = "text/markdown"
-                transformed["raw_content"] = payload.get('source').get('content').strip()
-            else:
-                transformed["raw_content"] = markdownify(payload.get('content').strip())
-                transformed["_media_type"] = payload.get('source').get('mediaType')
        else:
-            transformed["raw_content"] = markdownify(payload.get('content').strip()).strip()
            # Assume HTML by convention
-            transformed["_rendered_content"] = payload.get('content').strip()
            transformed["_media_type"] = "text/html"
+            transformed["raw_content"] = payload.get('content').strip()
+            transformed["_rendered_content"] = transformed["raw_content"]
    elif key == "endpoints" and isinstance(value, dict):
        if "inboxes" not in transformed:
            transformed["inboxes"] = {"private": None, "public": None}
--- a/federation/tests/entities/activitypub/test_entities.py
+++ b/federation/tests/entities/activitypub/test_entities.py
@ -409,10 +409,6 @@ class TestEntitiesPostReceive:
            "public": False,
        }]

-    def test_post__post_receive__cleans_linkified_tags(self, activitypubpost_linkified_tags):
-        activitypubpost_linkified_tags.post_receive()
-        assert activitypubpost_linkified_tags.raw_content == '<p>👁️foobar</p><p>barfoo!<br>#fanart #mastoart</p>'
-

 class TestEntitiesPreSend:
    def test_post_inline_images_are_attached(self, activitypubpost_embedded_images):
--- a/federation/tests/entities/activitypub/test_mappers.py
+++ b/federation/tests/entities/activitypub/test_mappers.py
@ -67,7 +67,9 @@ class TestActivitypubEntityMappersReceive:
        post = entities[0]
        assert isinstance(post, ActivitypubPost)
        assert isinstance(post, Post)
-        assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom'
+        assert post.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
+                                   'href="https://dev.jasonrobinson.me/u/jaywink/">' \
+                                   '@<span>jaywink</span></a></span> boom</p>'
        assert post.rendered_content == '<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" ' \
                                        'class="u-url mention">@<span>jaywink</span></a></span> boom</p>'
        assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
@ -82,7 +84,7 @@ class TestActivitypubEntityMappersReceive:
        post = entities[0]
        assert isinstance(post, ActivitypubPost)
        assert isinstance(post, Post)
-        assert post.raw_content == 'boom #test'
+        assert post.raw_content == '<p>boom #test</p>'

    def test_message_to_objects_simple_post__with_mentions(self):
        entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink")
@ -101,7 +103,9 @@ class TestActivitypubEntityMappersReceive:
        assert isinstance(post, Post)
        assert post.rendered_content == '<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" ' \
                                        'class="u-url mention">@<span>jaywink</span></a></span> boom</p>'
-        assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom\n\n'
+        assert post.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
+                                   'href="https://dev.jasonrobinson.me/u/jaywink/">' \
+                                   '@<span>jaywink</span></a></span> boom</p>'

    def test_message_to_objects_simple_post__with_source__markdown(self):
        entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink")
@ -141,7 +145,9 @@ class TestActivitypubEntityMappersReceive:
        comment = entities[0]
        assert isinstance(comment, ActivitypubComment)
        assert isinstance(comment, Comment)
-        assert comment.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom'
+        assert comment.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
+                                      'href="https://dev.jasonrobinson.me/u/jaywink/">' \
+                                      '@<span>jaywink</span></a></span> boom</p>'
        assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
        assert comment.actor_id == "https://diaspodon.fr/users/jaywink"
        assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/"
--- a/federation/tests/fixtures/entities.py
+++ b/federation/tests/fixtures/entities.py
@ -4,7 +4,7 @@ from freezegun import freeze_time
 from federation.entities.activitypub.entities import (
    ActivitypubPost, ActivitypubAccept, ActivitypubFollow, ActivitypubProfile, ActivitypubComment,
    ActivitypubRetraction, ActivitypubShare, ActivitypubImage)
-from federation.entities.base import Profile, Image
+from federation.entities.base import Profile
 from federation.entities.diaspora.entities import (
    DiasporaPost, DiasporaComment, DiasporaLike, DiasporaProfile, DiasporaRetraction,
    DiasporaContact, DiasporaReshare,
@ -144,20 +144,6 @@ https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a902
        )


-@pytest.fixture
-def activitypubpost_linkified_tags():
-    with freeze_time("2019-04-27"):
-        return ActivitypubPost(
-            raw_content='<p>👁️foobar</p><p>barfoo!<br>[#fanart](https://mastodon.art/tags/fanart) '
-                        '[#mastoart](https://mastodon.art/tags/mastoart)</p>',
-            public=True,
-            provider_display_name="Mastodon",
-            id=f"http://127.0.0.1:8000/post/123456/",
-            activity_id=f"http://127.0.0.1:8000/post/123456/#create",
-            actor_id=f"http://127.0.0.1:8000/profile/123456/",
-        )
-
-
@pytest.fixture
 def activitypubprofile():
    return ActivitypubProfile(
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@ -34,12 +34,13 @@ class TestFindTags:
        assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n    #alsocode\n"

    def test_endings_are_filtered_out(self):
-        source = "#parenthesis) #exp! #list]"
+        source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar"
        tags, text = find_tags(source)
-        assert tags == {"parenthesis", "exp", "list"}
+        assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
-        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
+        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \
+                       "#foo/foo/#bar/bar"

    def test_finds_tags(self):
        source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
@ -49,6 +50,14 @@ class TestFindTags:
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"

+    def test_ok_with_html_tags_in_text(self):
+        source = "<p>#starting and <span>#MixED</span> however not <#>this</#> or <#/>that"
+        tags, text = find_tags(source)
+        assert tags == {"starting", "mixed"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "<p>#starting/starting and <span>#MixED/mixed</span> however not <#>this</#> or <#/>that"
+
    def test_postfixed_tags(self):
        source = "#foo) #bar] #hoo, #hee."
        tags, text = find_tags(source)
@ -66,7 +75,7 @@ class TestFindTags:
        assert text == "(#foo/foo [#bar/bar"

    def test_invalid_text_returns_no_tags(self):
-        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
+        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a(a #a)a #a=a " \
                 "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
        tags, text = find_tags(source)
        assert tags == set()
@ -74,6 +83,14 @@ class TestFindTags:
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == source

+    def test_start_of_paragraph_in_html_content(self):
+        source = '<p>First line</p><p>#foobar #barfoo</p>'
+        tags, text = find_tags(source)
+        assert tags == {"foobar", "barfoo"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == '<p>First line</p><p>#foobar/foobar #barfoo/barfoo</p>'
+

 class TestProcessTextLinks:
    def test_link_at_start_or_end(self):
@ -97,6 +114,12 @@ class TestProcessTextLinks:
        assert process_text_links('<a href="/streams/tag/foobar">#foobar</a>') == \
               '<a href="/streams/tag/foobar">#foobar</a>'

+    def test_does_not_remove_mention_classes(self):
+        assert process_text_links('<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" '
+                                  'class="u-url mention">@<span>jaywink</span></a></span> boom</p>') == \
+           '<p><span class="h-card"><a class="u-url mention" href="https://dev.jasonrobinson.me/u/jaywink/" ' \
+           'rel="nofollow" target="_blank">@<span>jaywink</span></a></span> boom</p>'
+

 def test_validate_handle():
    assert validate_handle("foo@bar.com")
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@ -33,7 +33,9 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
    Returns a set of tags and the original or replaced text.
    """
    found_tags = set()
-    lines = text.splitlines(keepends=True)
+    # <br> and <p> tags cause issues in us finding words - add some spacing around them
+    new_text = text.replace("<br>", " <br> ").replace("<p>", " <p> ").replace("</p>", " </p> ")
+    lines = new_text.splitlines(keepends=True)
    final_lines = []
    code_block = False
    final_text = None
@ -49,17 +51,28 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
        # Check each word separately
        words = line.split(" ")
        for word in words:
-            candidate = word.strip().strip("([]),.!?:")
-            if candidate.startswith("#"):
-                candidate = candidate.strip("#")
-                if test_tag(candidate.lower()):
-                    found_tags.add(candidate.lower())
-                    if replacer:
-                        try:
-                            tag_word = word.replace("#%s" % candidate, replacer(candidate))
-                            final_words.append(tag_word)
-                        except Exception:
-                            final_words.append(word)
+            if word.find('#') > -1:
+                candidate = word.strip().strip("([]),.!?:*_%/")
+                if candidate.find('<') > -1 or candidate.find('>') > -1:
+                    # Strip html
+                    candidate = bleach.clean(word, strip=True)
+                # Now split with slashes
+                candidates = candidate.split("/")
+                to_replace = []
+                for candidate in candidates:
+                    if candidate.startswith("#"):
+                        candidate = candidate.strip("#")
+                        if test_tag(candidate.lower()):
+                            found_tags.add(candidate.lower())
+                            to_replace.append(candidate)
+                if replacer:
+                    tag_word = word
+                    try:
+                        for counter, replacee in enumerate(to_replace, 1):
+                            tag_word = tag_word.replace("#%s" % replacee, replacer(replacee))
+                    except Exception:
+                        pass
+                    final_words.append(tag_word)
                else:
                    final_words.append(word)
            else:
@ -67,6 +80,8 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
        final_lines.append(" ".join(final_words))
    if replacer:
        final_text = "".join(final_lines)
+    if final_text:
+        final_text = final_text.replace(" <br> ", "<br>").replace(" <p> ", "<p>").replace(" </p> ", "</p>")
    return found_tags, final_text or text


--- a/setup.py
+++ b/setup.py
@ -36,7 +36,6 @@ setup(
        "lxml>=3.4.0",
        "ipdata>=3.0",
        "iteration_utilities",
-        "markdownify",
        "jsonschema>=2.0.0",
        "pycryptodome>=3.4.10",
        "python-dateutil>=2.4.0",