From 13f0cf0db667755dc85d24e840534abb51769b80 Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Sun, 12 Apr 2020 23:32:42 +0300 Subject: [PATCH 1/6] Stop markdownifying received ActivityPub content It was causing more trouble than benefits. Just accept HTML content into entities raw_content attribute and let apps deal with it. --- CHANGELOG.md | 4 ++++ federation/entities/activitypub/entities.py | 20 ++++++++++++------- federation/entities/activitypub/mappers.py | 17 ++++++---------- .../entities/activitypub/test_entities.py | 4 ---- .../entities/activitypub/test_mappers.py | 14 +++++++++---- federation/tests/fixtures/entities.py | 16 +-------------- setup.py | 1 - 7 files changed, 34 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4c698f..e470a3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,10 @@ * Don't include OStatus for Mastodon 3.0+ protocols list. ([related issue](https://github.com/thefederationinfo/the-federation.info/issues/217)) +* **Backwards incompatible**: Stop markdownifying incoming ActivityPub content. Instead + copy it as is to the ``raw_content`` attribute on the entity, setting also the + ``_media_type`` to ``text/html``. + ### Fixed * Don't crash loudly when fetching webfinger for Diaspora that does not contain XML. diff --git a/federation/entities/activitypub/entities.py b/federation/entities/activitypub/entities.py index d8163cb..0b88fee 100644 --- a/federation/entities/activitypub/entities.py +++ b/federation/entities/activitypub/entities.py @@ -3,6 +3,8 @@ import re import uuid from typing import Dict, List +import bleach + from federation.entities.activitypub.constants import ( CONTEXTS_DEFAULT, CONTEXT_MANUALLY_APPROVES_FOLLOWERS, CONTEXT_SENSITIVE, CONTEXT_HASHTAG, CONTEXT_LD_SIGNATURES) @@ -57,15 +59,19 @@ class CleanContentMixin(RawContentMixin): """ Make linkified tags normal tags. """ - def cleaner(match): - return f"#{match.groups()[0]}" - super().post_receive() - self.raw_content = re.sub( - r'\[#([\w\-_]+)\]\(http?s://[a-zA-Z0-9/._-]+\)', - cleaner, + + def remove_tag_links(attrs, new=False): + rel = (None, "rel") + if attrs.get(rel) == "tag": + return + return attrs + + self.raw_content = bleach.linkify( self.raw_content, - re.MULTILINE, + callbacks=[remove_tag_links], + parse_email=False, + skip_tags=["code", "pre"], ) diff --git a/federation/entities/activitypub/mappers.py b/federation/entities/activitypub/mappers.py index 4254f96..1506a48 100644 --- a/federation/entities/activitypub/mappers.py +++ b/federation/entities/activitypub/mappers.py @@ -1,8 +1,6 @@ import logging from typing import List, Callable, Dict, Union, Optional -from markdownify import markdownify - from federation.entities.activitypub.constants import NAMESPACE_PUBLIC from federation.entities.activitypub.entities import ( ActivitypubFollow, ActivitypubProfile, ActivitypubAccept, ActivitypubPost, ActivitypubComment, @@ -259,19 +257,16 @@ def transform_attribute( elif key == "attributedTo" and is_object: transformed["actor_id"] = value elif key in ("content", "source"): - if payload.get('source') and isinstance(payload.get("source"), dict): + if payload.get('source') and isinstance(payload.get("source"), dict) and \ + payload.get('source').get('mediaType') == "text/markdown": + transformed["_media_type"] = "text/markdown" + transformed["raw_content"] = payload.get('source').get('content').strip() transformed["_rendered_content"] = payload.get('content').strip() - if payload.get('source').get('mediaType') == "text/markdown": - transformed["_media_type"] = "text/markdown" - transformed["raw_content"] = payload.get('source').get('content').strip() - else: - transformed["raw_content"] = markdownify(payload.get('content').strip()) - transformed["_media_type"] = payload.get('source').get('mediaType') else: - transformed["raw_content"] = markdownify(payload.get('content').strip()).strip() # Assume HTML by convention - transformed["_rendered_content"] = payload.get('content').strip() transformed["_media_type"] = "text/html" + transformed["raw_content"] = payload.get('content').strip() + transformed["_rendered_content"] = transformed["raw_content"] elif key == "endpoints" and isinstance(value, dict): if "inboxes" not in transformed: transformed["inboxes"] = {"private": None, "public": None} diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py index 87ec829..7ad874b 100644 --- a/federation/tests/entities/activitypub/test_entities.py +++ b/federation/tests/entities/activitypub/test_entities.py @@ -409,10 +409,6 @@ class TestEntitiesPostReceive: "public": False, }] - def test_post__post_receive__cleans_linkified_tags(self, activitypubpost_linkified_tags): - activitypubpost_linkified_tags.post_receive() - assert activitypubpost_linkified_tags.raw_content == '

👁️foobar

barfoo!
#fanart #mastoart

' - class TestEntitiesPreSend: def test_post_inline_images_are_attached(self, activitypubpost_embedded_images): diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py index c50bfae..f8c5628 100644 --- a/federation/tests/entities/activitypub/test_mappers.py +++ b/federation/tests/entities/activitypub/test_mappers.py @@ -67,7 +67,9 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, ActivitypubPost) assert isinstance(post, Post) - assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom' + assert post.raw_content == '

' \ + '@jaywink boom

' assert post.rendered_content == '

@jaywink boom

' assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" @@ -82,7 +84,7 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, ActivitypubPost) assert isinstance(post, Post) - assert post.raw_content == 'boom #test' + assert post.raw_content == '

boom #test

' def test_message_to_objects_simple_post__with_mentions(self): entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink") @@ -101,7 +103,9 @@ class TestActivitypubEntityMappersReceive: assert isinstance(post, Post) assert post.rendered_content == '

@jaywink boom

' - assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom\n\n' + assert post.raw_content == '

' \ + '@jaywink boom

' def test_message_to_objects_simple_post__with_source__markdown(self): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink") @@ -141,7 +145,9 @@ class TestActivitypubEntityMappersReceive: comment = entities[0] assert isinstance(comment, ActivitypubComment) assert isinstance(comment, Comment) - assert comment.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom' + assert comment.raw_content == '

' \ + '@jaywink boom

' assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" assert comment.actor_id == "https://diaspodon.fr/users/jaywink" assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/" diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py index 317f66d..268f059 100644 --- a/federation/tests/fixtures/entities.py +++ b/federation/tests/fixtures/entities.py @@ -4,7 +4,7 @@ from freezegun import freeze_time from federation.entities.activitypub.entities import ( ActivitypubPost, ActivitypubAccept, ActivitypubFollow, ActivitypubProfile, ActivitypubComment, ActivitypubRetraction, ActivitypubShare, ActivitypubImage) -from federation.entities.base import Profile, Image +from federation.entities.base import Profile from federation.entities.diaspora.entities import ( DiasporaPost, DiasporaComment, DiasporaLike, DiasporaProfile, DiasporaRetraction, DiasporaContact, DiasporaReshare, @@ -144,20 +144,6 @@ https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a902 ) -@pytest.fixture -def activitypubpost_linkified_tags(): - with freeze_time("2019-04-27"): - return ActivitypubPost( - raw_content='

👁️foobar

barfoo!
[#fanart](https://mastodon.art/tags/fanart) ' - '[#mastoart](https://mastodon.art/tags/mastoart)

', - public=True, - provider_display_name="Mastodon", - id=f"http://127.0.0.1:8000/post/123456/", - activity_id=f"http://127.0.0.1:8000/post/123456/#create", - actor_id=f"http://127.0.0.1:8000/profile/123456/", - ) - - @pytest.fixture def activitypubprofile(): return ActivitypubProfile( diff --git a/setup.py b/setup.py index c59182d..0757dee 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,6 @@ setup( "lxml>=3.4.0", "ipdata>=3.0", "iteration_utilities", - "markdownify", "jsonschema>=2.0.0", "pycryptodome>=3.4.10", "python-dateutil>=2.4.0", From f704175a218a1c3eceb1c8f6e3c6b4083e78227d Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 13 Apr 2020 11:35:54 +0300 Subject: [PATCH 2/6] Ensure tags are found also if wrapped within HTML blocks --- federation/tests/utils/test_text.py | 8 ++++++++ federation/utils/text.py | 28 +++++++++++++++++----------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 6f0c1b1..8f406c1 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -49,6 +49,14 @@ class TestFindTags: tags, text = find_tags(source, replacer=self._replacer) assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth" + def test_ok_with_html_tags_in_text(self): + source = "

#starting and #MixED however not <#>this or <#/>that" + tags, text = find_tags(source) + assert tags == {"starting", "mixed"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "

#starting/starting and #MixED/mixed however not <#>this or <#/>that" + def test_postfixed_tags(self): source = "#foo) #bar] #hoo, #hee." tags, text = find_tags(source) diff --git a/federation/utils/text.py b/federation/utils/text.py index 8bfb02b..93bf311 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -49,17 +49,23 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: # Check each word separately words = line.split(" ") for word in words: - candidate = word.strip().strip("([]),.!?:") - if candidate.startswith("#"): - candidate = candidate.strip("#") - if test_tag(candidate.lower()): - found_tags.add(candidate.lower()) - if replacer: - try: - tag_word = word.replace("#%s" % candidate, replacer(candidate)) - final_words.append(tag_word) - except Exception: - final_words.append(word) + if word.find('#') > -1: + candidate = word.strip().strip("([]),.!?:") + if candidate.find('<') > -1 or candidate.find('>') > -1: + # Strip html + candidate = bleach.clean(word, strip=True) + if candidate.startswith("#"): + candidate = candidate.strip("#") + if test_tag(candidate.lower()): + found_tags.add(candidate.lower()) + if replacer: + try: + tag_word = word.replace("#%s" % candidate, replacer(candidate)) + final_words.append(tag_word) + except Exception: + final_words.append(word) + else: + final_words.append(word) else: final_words.append(word) else: From a2653239d66ee6a97f06192aeba9f66baa4386f3 Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 13 Apr 2020 11:46:10 +0300 Subject: [PATCH 3/6] Fix some characters stopping tags being identified Closes socialhome/socialhome#222 --- CHANGELOG.md | 2 ++ federation/tests/utils/test_text.py | 6 +++--- federation/utils/text.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e470a3a..fcd8726 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,8 @@ * Don't try to relay AP payloads to Diaspora receivers and vice versa, for now, until cross-protocol relaying is supported. + +* Fix some characters stopping tags being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222)) ## [0.19.0] - 2019-12-15 diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 8f406c1..fc39abe 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -34,12 +34,12 @@ class TestFindTags: assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n" def test_endings_are_filtered_out(self): - source = "#parenthesis) #exp! #list]" + source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah%" tags, text = find_tags(source) - assert tags == {"parenthesis", "exp", "list"} + assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah"} assert text == source tags, text = find_tags(source, replacer=self._replacer) - assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]" + assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah%" def test_finds_tags(self): source = "#post **Foobar** #tag #OtherTag #third\n#fourth" diff --git a/federation/utils/text.py b/federation/utils/text.py index 93bf311..7cbe206 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -50,7 +50,7 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: words = line.split(" ") for word in words: if word.find('#') > -1: - candidate = word.strip().strip("([]),.!?:") + candidate = word.strip().strip("([]),.!?:*_%") if candidate.find('<') > -1 or candidate.find('>') > -1: # Strip html candidate = bleach.clean(word, strip=True) From 7c1f1670b1a3bf4f7856d9bd9ad647457280430a Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 13 Apr 2020 12:40:35 +0300 Subject: [PATCH 4/6] Fix tags separated by slashes not being identified Fixes socialhome/socialhome#198 --- CHANGELOG.md | 2 ++ federation/tests/utils/test_text.py | 9 +++++---- federation/utils/text.py | 31 +++++++++++++++++------------ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcd8726..842ce30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,6 +66,8 @@ * Fix some characters stopping tags being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222)) +* Fix tags separated by slashes being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/198)) + ## [0.19.0] - 2019-12-15 ### Added diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index fc39abe..9073807 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -34,12 +34,13 @@ class TestFindTags: assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n" def test_endings_are_filtered_out(self): - source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah%" + source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar" tags, text = find_tags(source) - assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah"} + assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"} assert text == source tags, text = find_tags(source, replacer=self._replacer) - assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah%" + assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \ + "#foo/foo/#bar/bar" def test_finds_tags(self): source = "#post **Foobar** #tag #OtherTag #third\n#fourth" @@ -74,7 +75,7 @@ class TestFindTags: assert text == "(#foo/foo [#bar/bar" def test_invalid_text_returns_no_tags(self): - source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \ + source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a(a #a)a #a=a " \ "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd" tags, text = find_tags(source) assert tags == set() diff --git a/federation/utils/text.py b/federation/utils/text.py index 7cbe206..9ad2cfe 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -50,22 +50,27 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: words = line.split(" ") for word in words: if word.find('#') > -1: - candidate = word.strip().strip("([]),.!?:*_%") + candidate = word.strip().strip("([]),.!?:*_%/") if candidate.find('<') > -1 or candidate.find('>') > -1: # Strip html candidate = bleach.clean(word, strip=True) - if candidate.startswith("#"): - candidate = candidate.strip("#") - if test_tag(candidate.lower()): - found_tags.add(candidate.lower()) - if replacer: - try: - tag_word = word.replace("#%s" % candidate, replacer(candidate)) - final_words.append(tag_word) - except Exception: - final_words.append(word) - else: - final_words.append(word) + # Now split with slashes + candidates = candidate.split("/") + to_replace = [] + for candidate in candidates: + if candidate.startswith("#"): + candidate = candidate.strip("#") + if test_tag(candidate.lower()): + found_tags.add(candidate.lower()) + to_replace.append(candidate) + if replacer: + tag_word = word + try: + for counter, replacee in enumerate(to_replace, 1): + tag_word = tag_word.replace("#%s" % replacee, replacer(replacee)) + except Exception: + pass + final_words.append(tag_word) else: final_words.append(word) else: From dfaa692ea4dcda1edb2a12a24306487b4ec3e026 Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 13 Apr 2020 13:22:12 +0300 Subject: [PATCH 5/6] Add test ensuring mention classes are not stripped By process_text_links text util. --- federation/tests/utils/test_text.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 9073807..2efa917 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -106,6 +106,12 @@ class TestProcessTextLinks: assert process_text_links('#foobar') == \ '#foobar' + def test_does_not_remove_mention_classes(self): + assert process_text_links('

@jaywink boom

') == \ + '

@jaywink boom

' + def test_validate_handle(): assert validate_handle("foo@bar.com") From 9469101549e6218e63a9d43f6d3b42b25c5a0e95 Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 13 Apr 2020 21:28:22 +0300 Subject: [PATCH 6/6] Fix tag recognition if at start of HTML paragraph --- federation/tests/utils/test_text.py | 8 ++++++++ federation/utils/text.py | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 2efa917..fa69b6b 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -83,6 +83,14 @@ class TestFindTags: tags, text = find_tags(source, replacer=self._replacer) assert text == source + def test_start_of_paragraph_in_html_content(self): + source = '

First line

#foobar #barfoo

' + tags, text = find_tags(source) + assert tags == {"foobar", "barfoo"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == '

First line

#foobar/foobar #barfoo/barfoo

' + class TestProcessTextLinks: def test_link_at_start_or_end(self): diff --git a/federation/utils/text.py b/federation/utils/text.py index 9ad2cfe..cebed5a 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -33,7 +33,9 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: Returns a set of tags and the original or replaced text. """ found_tags = set() - lines = text.splitlines(keepends=True) + #
and

tags cause issues in us finding words - add some spacing around them + new_text = text.replace("
", "
").replace("

", "

").replace("

", "

") + lines = new_text.splitlines(keepends=True) final_lines = [] code_block = False final_text = None @@ -78,6 +80,8 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: final_lines.append(" ".join(final_words)) if replacer: final_text = "".join(final_lines) + if final_text: + final_text = final_text.replace("
", "
").replace("

", "

").replace("

", "

") return found_tags, final_text or text