diff --git a/CHANGELOG.md b/CHANGELOG.md index a4c698f..842ce30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,10 @@ * Don't include OStatus for Mastodon 3.0+ protocols list. ([related issue](https://github.com/thefederationinfo/the-federation.info/issues/217)) +* **Backwards incompatible**: Stop markdownifying incoming ActivityPub content. Instead + copy it as is to the ``raw_content`` attribute on the entity, setting also the + ``_media_type`` to ``text/html``. + ### Fixed * Don't crash loudly when fetching webfinger for Diaspora that does not contain XML. @@ -59,6 +63,10 @@ * Don't try to relay AP payloads to Diaspora receivers and vice versa, for now, until cross-protocol relaying is supported. + +* Fix some characters stopping tags being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222)) + +* Fix tags separated by slashes being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/198)) ## [0.19.0] - 2019-12-15 diff --git a/federation/entities/activitypub/entities.py b/federation/entities/activitypub/entities.py index d8163cb..0b88fee 100644 --- a/federation/entities/activitypub/entities.py +++ b/federation/entities/activitypub/entities.py @@ -3,6 +3,8 @@ import re import uuid from typing import Dict, List +import bleach + from federation.entities.activitypub.constants import ( CONTEXTS_DEFAULT, CONTEXT_MANUALLY_APPROVES_FOLLOWERS, CONTEXT_SENSITIVE, CONTEXT_HASHTAG, CONTEXT_LD_SIGNATURES) @@ -57,15 +59,19 @@ class CleanContentMixin(RawContentMixin): """ Make linkified tags normal tags. """ - def cleaner(match): - return f"#{match.groups()[0]}" - super().post_receive() - self.raw_content = re.sub( - r'\[#([\w\-_]+)\]\(http?s://[a-zA-Z0-9/._-]+\)', - cleaner, + + def remove_tag_links(attrs, new=False): + rel = (None, "rel") + if attrs.get(rel) == "tag": + return + return attrs + + self.raw_content = bleach.linkify( self.raw_content, - re.MULTILINE, + callbacks=[remove_tag_links], + parse_email=False, + skip_tags=["code", "pre"], ) diff --git a/federation/entities/activitypub/mappers.py b/federation/entities/activitypub/mappers.py index 4254f96..1506a48 100644 --- a/federation/entities/activitypub/mappers.py +++ b/federation/entities/activitypub/mappers.py @@ -1,8 +1,6 @@ import logging from typing import List, Callable, Dict, Union, Optional -from markdownify import markdownify - from federation.entities.activitypub.constants import NAMESPACE_PUBLIC from federation.entities.activitypub.entities import ( ActivitypubFollow, ActivitypubProfile, ActivitypubAccept, ActivitypubPost, ActivitypubComment, @@ -259,19 +257,16 @@ def transform_attribute( elif key == "attributedTo" and is_object: transformed["actor_id"] = value elif key in ("content", "source"): - if payload.get('source') and isinstance(payload.get("source"), dict): + if payload.get('source') and isinstance(payload.get("source"), dict) and \ + payload.get('source').get('mediaType') == "text/markdown": + transformed["_media_type"] = "text/markdown" + transformed["raw_content"] = payload.get('source').get('content').strip() transformed["_rendered_content"] = payload.get('content').strip() - if payload.get('source').get('mediaType') == "text/markdown": - transformed["_media_type"] = "text/markdown" - transformed["raw_content"] = payload.get('source').get('content').strip() - else: - transformed["raw_content"] = markdownify(payload.get('content').strip()) - transformed["_media_type"] = payload.get('source').get('mediaType') else: - transformed["raw_content"] = markdownify(payload.get('content').strip()).strip() # Assume HTML by convention - transformed["_rendered_content"] = payload.get('content').strip() transformed["_media_type"] = "text/html" + transformed["raw_content"] = payload.get('content').strip() + transformed["_rendered_content"] = transformed["raw_content"] elif key == "endpoints" and isinstance(value, dict): if "inboxes" not in transformed: transformed["inboxes"] = {"private": None, "public": None} diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py index 87ec829..7ad874b 100644 --- a/federation/tests/entities/activitypub/test_entities.py +++ b/federation/tests/entities/activitypub/test_entities.py @@ -409,10 +409,6 @@ class TestEntitiesPostReceive: "public": False, }] - def test_post__post_receive__cleans_linkified_tags(self, activitypubpost_linkified_tags): - activitypubpost_linkified_tags.post_receive() - assert activitypubpost_linkified_tags.raw_content == '

👁️foobar

barfoo!
#fanart #mastoart

' - class TestEntitiesPreSend: def test_post_inline_images_are_attached(self, activitypubpost_embedded_images): diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py index c50bfae..f8c5628 100644 --- a/federation/tests/entities/activitypub/test_mappers.py +++ b/federation/tests/entities/activitypub/test_mappers.py @@ -67,7 +67,9 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, ActivitypubPost) assert isinstance(post, Post) - assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom' + assert post.raw_content == '

' \ + '@jaywink boom

' assert post.rendered_content == '

@jaywink boom

' assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" @@ -82,7 +84,7 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, ActivitypubPost) assert isinstance(post, Post) - assert post.raw_content == 'boom #test' + assert post.raw_content == '

boom #test

' def test_message_to_objects_simple_post__with_mentions(self): entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink") @@ -101,7 +103,9 @@ class TestActivitypubEntityMappersReceive: assert isinstance(post, Post) assert post.rendered_content == '

@jaywink boom

' - assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom\n\n' + assert post.raw_content == '

' \ + '@jaywink boom

' def test_message_to_objects_simple_post__with_source__markdown(self): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink") @@ -141,7 +145,9 @@ class TestActivitypubEntityMappersReceive: comment = entities[0] assert isinstance(comment, ActivitypubComment) assert isinstance(comment, Comment) - assert comment.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom' + assert comment.raw_content == '

' \ + '@jaywink boom

' assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" assert comment.actor_id == "https://diaspodon.fr/users/jaywink" assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/" diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py index 317f66d..268f059 100644 --- a/federation/tests/fixtures/entities.py +++ b/federation/tests/fixtures/entities.py @@ -4,7 +4,7 @@ from freezegun import freeze_time from federation.entities.activitypub.entities import ( ActivitypubPost, ActivitypubAccept, ActivitypubFollow, ActivitypubProfile, ActivitypubComment, ActivitypubRetraction, ActivitypubShare, ActivitypubImage) -from federation.entities.base import Profile, Image +from federation.entities.base import Profile from federation.entities.diaspora.entities import ( DiasporaPost, DiasporaComment, DiasporaLike, DiasporaProfile, DiasporaRetraction, DiasporaContact, DiasporaReshare, @@ -144,20 +144,6 @@ https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a902 ) -@pytest.fixture -def activitypubpost_linkified_tags(): - with freeze_time("2019-04-27"): - return ActivitypubPost( - raw_content='

👁️foobar

barfoo!
[#fanart](https://mastodon.art/tags/fanart) ' - '[#mastoart](https://mastodon.art/tags/mastoart)

', - public=True, - provider_display_name="Mastodon", - id=f"http://127.0.0.1:8000/post/123456/", - activity_id=f"http://127.0.0.1:8000/post/123456/#create", - actor_id=f"http://127.0.0.1:8000/profile/123456/", - ) - - @pytest.fixture def activitypubprofile(): return ActivitypubProfile( diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 6f0c1b1..fa69b6b 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -34,12 +34,13 @@ class TestFindTags: assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n" def test_endings_are_filtered_out(self): - source = "#parenthesis) #exp! #list]" + source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar" tags, text = find_tags(source) - assert tags == {"parenthesis", "exp", "list"} + assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"} assert text == source tags, text = find_tags(source, replacer=self._replacer) - assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]" + assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \ + "#foo/foo/#bar/bar" def test_finds_tags(self): source = "#post **Foobar** #tag #OtherTag #third\n#fourth" @@ -49,6 +50,14 @@ class TestFindTags: tags, text = find_tags(source, replacer=self._replacer) assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth" + def test_ok_with_html_tags_in_text(self): + source = "

#starting and #MixED however not <#>this or <#/>that" + tags, text = find_tags(source) + assert tags == {"starting", "mixed"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "

#starting/starting and #MixED/mixed however not <#>this or <#/>that" + def test_postfixed_tags(self): source = "#foo) #bar] #hoo, #hee." tags, text = find_tags(source) @@ -66,7 +75,7 @@ class TestFindTags: assert text == "(#foo/foo [#bar/bar" def test_invalid_text_returns_no_tags(self): - source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \ + source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a(a #a)a #a=a " \ "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd" tags, text = find_tags(source) assert tags == set() @@ -74,6 +83,14 @@ class TestFindTags: tags, text = find_tags(source, replacer=self._replacer) assert text == source + def test_start_of_paragraph_in_html_content(self): + source = '

First line

#foobar #barfoo

' + tags, text = find_tags(source) + assert tags == {"foobar", "barfoo"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == '

First line

#foobar/foobar #barfoo/barfoo

' + class TestProcessTextLinks: def test_link_at_start_or_end(self): @@ -97,6 +114,12 @@ class TestProcessTextLinks: assert process_text_links('#foobar') == \ '#foobar' + def test_does_not_remove_mention_classes(self): + assert process_text_links('

@jaywink boom

') == \ + '

@jaywink boom

' + def test_validate_handle(): assert validate_handle("foo@bar.com") diff --git a/federation/utils/text.py b/federation/utils/text.py index 8bfb02b..cebed5a 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -33,7 +33,9 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: Returns a set of tags and the original or replaced text. """ found_tags = set() - lines = text.splitlines(keepends=True) + #
and

tags cause issues in us finding words - add some spacing around them + new_text = text.replace("
", "
").replace("

", "

").replace("

", "

") + lines = new_text.splitlines(keepends=True) final_lines = [] code_block = False final_text = None @@ -49,17 +51,28 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: # Check each word separately words = line.split(" ") for word in words: - candidate = word.strip().strip("([]),.!?:") - if candidate.startswith("#"): - candidate = candidate.strip("#") - if test_tag(candidate.lower()): - found_tags.add(candidate.lower()) - if replacer: - try: - tag_word = word.replace("#%s" % candidate, replacer(candidate)) - final_words.append(tag_word) - except Exception: - final_words.append(word) + if word.find('#') > -1: + candidate = word.strip().strip("([]),.!?:*_%/") + if candidate.find('<') > -1 or candidate.find('>') > -1: + # Strip html + candidate = bleach.clean(word, strip=True) + # Now split with slashes + candidates = candidate.split("/") + to_replace = [] + for candidate in candidates: + if candidate.startswith("#"): + candidate = candidate.strip("#") + if test_tag(candidate.lower()): + found_tags.add(candidate.lower()) + to_replace.append(candidate) + if replacer: + tag_word = word + try: + for counter, replacee in enumerate(to_replace, 1): + tag_word = tag_word.replace("#%s" % replacee, replacer(replacee)) + except Exception: + pass + final_words.append(tag_word) else: final_words.append(word) else: @@ -67,6 +80,8 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: final_lines.append(" ".join(final_words)) if replacer: final_text = "".join(final_lines) + if final_text: + final_text = final_text.replace("
", "
").replace("

", "

").replace("

", "

") return found_tags, final_text or text diff --git a/setup.py b/setup.py index c59182d..0757dee 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,6 @@ setup( "lxml>=3.4.0", "ipdata>=3.0", "iteration_utilities", - "markdownify", "jsonschema>=2.0.0", "pycryptodome>=3.4.10", "python-dateutil>=2.4.0",