Merge branch 'inbound-ap-html' into 'master'

Stop markdownifying received ActivityPub content

Closes socialhome/socialhome#198 and socialhome/socialhome#222

See merge request jaywink/federation!160
merge-requests/161/merge
jaywink 2020-04-13 18:33:18 +00:00
commit b4cc7071f4
9 zmienionych plików z 92 dodań i 58 usunięć

Wyświetl plik

@ -32,6 +32,10 @@
* Don't include OStatus for Mastodon 3.0+ protocols list. ([related issue](https://github.com/thefederationinfo/the-federation.info/issues/217))
* **Backwards incompatible**: Stop markdownifying incoming ActivityPub content. Instead,
copy it as-is to the ``raw_content`` attribute on the entity and set the
``_media_type`` to ``text/html``.
### Fixed
* Don't crash loudly when the webfinger document fetched for Diaspora does not contain XML.
@ -59,6 +63,10 @@
* Don't try to relay AP payloads to Diaspora receivers and vice versa, for now, until cross-protocol
relaying is supported.
* Fix certain surrounding characters (for example `*`, `_` and `%`) preventing tags from being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/222))
* Fix tags separated by slashes (for example `#foo/#bar`) not being identified ([related issue](https://git.feneas.org/socialhome/socialhome/-/issues/198))
## [0.19.0] - 2019-12-15

Wyświetl plik

@ -3,6 +3,8 @@ import re
import uuid
from typing import Dict, List
import bleach
from federation.entities.activitypub.constants import (
CONTEXTS_DEFAULT, CONTEXT_MANUALLY_APPROVES_FOLLOWERS, CONTEXT_SENSITIVE, CONTEXT_HASHTAG,
CONTEXT_LD_SIGNATURES)
@ -57,15 +59,19 @@ class CleanContentMixin(RawContentMixin):
"""
Make linkified tags normal tags.
"""
def cleaner(match):
return f"#{match.groups()[0]}"
super().post_receive()
self.raw_content = re.sub(
r'\[#([\w\-_]+)\]\(http?s://[a-zA-Z0-9/._-]+\)',
cleaner,
def remove_tag_links(attrs, new=False):
rel = (None, "rel")
if attrs.get(rel) == "tag":
return
return attrs
self.raw_content = bleach.linkify(
self.raw_content,
re.MULTILINE,
callbacks=[remove_tag_links],
parse_email=False,
skip_tags=["code", "pre"],
)

Wyświetl plik

@ -1,8 +1,6 @@
import logging
from typing import List, Callable, Dict, Union, Optional
from markdownify import markdownify
from federation.entities.activitypub.constants import NAMESPACE_PUBLIC
from federation.entities.activitypub.entities import (
ActivitypubFollow, ActivitypubProfile, ActivitypubAccept, ActivitypubPost, ActivitypubComment,
@ -259,19 +257,16 @@ def transform_attribute(
elif key == "attributedTo" and is_object:
transformed["actor_id"] = value
elif key in ("content", "source"):
if payload.get('source') and isinstance(payload.get("source"), dict):
if payload.get('source') and isinstance(payload.get("source"), dict) and \
payload.get('source').get('mediaType') == "text/markdown":
transformed["_media_type"] = "text/markdown"
transformed["raw_content"] = payload.get('source').get('content').strip()
transformed["_rendered_content"] = payload.get('content').strip()
if payload.get('source').get('mediaType') == "text/markdown":
transformed["_media_type"] = "text/markdown"
transformed["raw_content"] = payload.get('source').get('content').strip()
else:
transformed["raw_content"] = markdownify(payload.get('content').strip())
transformed["_media_type"] = payload.get('source').get('mediaType')
else:
transformed["raw_content"] = markdownify(payload.get('content').strip()).strip()
# Assume HTML by convention
transformed["_rendered_content"] = payload.get('content').strip()
transformed["_media_type"] = "text/html"
transformed["raw_content"] = payload.get('content').strip()
transformed["_rendered_content"] = transformed["raw_content"]
elif key == "endpoints" and isinstance(value, dict):
if "inboxes" not in transformed:
transformed["inboxes"] = {"private": None, "public": None}

Wyświetl plik

@ -409,10 +409,6 @@ class TestEntitiesPostReceive:
"public": False,
}]
def test_post__post_receive__cleans_linkified_tags(self, activitypubpost_linkified_tags):
activitypubpost_linkified_tags.post_receive()
assert activitypubpost_linkified_tags.raw_content == '<p>👁foobar</p><p>barfoo!<br>#fanart #mastoart</p>'
class TestEntitiesPreSend:
def test_post_inline_images_are_attached(self, activitypubpost_embedded_images):

Wyświetl plik

@ -67,7 +67,9 @@ class TestActivitypubEntityMappersReceive:
post = entities[0]
assert isinstance(post, ActivitypubPost)
assert isinstance(post, Post)
assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom'
assert post.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
'href="https://dev.jasonrobinson.me/u/jaywink/">' \
'@<span>jaywink</span></a></span> boom</p>'
assert post.rendered_content == '<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" ' \
'class="u-url mention">@<span>jaywink</span></a></span> boom</p>'
assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
@ -82,7 +84,7 @@ class TestActivitypubEntityMappersReceive:
post = entities[0]
assert isinstance(post, ActivitypubPost)
assert isinstance(post, Post)
assert post.raw_content == 'boom #test'
assert post.raw_content == '<p>boom #test</p>'
def test_message_to_objects_simple_post__with_mentions(self):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink")
@ -101,7 +103,9 @@ class TestActivitypubEntityMappersReceive:
assert isinstance(post, Post)
assert post.rendered_content == '<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" ' \
'class="u-url mention">@<span>jaywink</span></a></span> boom</p>'
assert post.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom\n\n'
assert post.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
'href="https://dev.jasonrobinson.me/u/jaywink/">' \
'@<span>jaywink</span></a></span> boom</p>'
def test_message_to_objects_simple_post__with_source__markdown(self):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink")
@ -141,7 +145,9 @@ class TestActivitypubEntityMappersReceive:
comment = entities[0]
assert isinstance(comment, ActivitypubComment)
assert isinstance(comment, Comment)
assert comment.raw_content == '[@jaywink](https://dev.jasonrobinson.me/u/jaywink/) boom'
assert comment.raw_content == '<p><span class="h-card"><a class="u-url mention" ' \
'href="https://dev.jasonrobinson.me/u/jaywink/">' \
'@<span>jaywink</span></a></span> boom</p>'
assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
assert comment.actor_id == "https://diaspodon.fr/users/jaywink"
assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/"

Wyświetl plik

@ -4,7 +4,7 @@ from freezegun import freeze_time
from federation.entities.activitypub.entities import (
ActivitypubPost, ActivitypubAccept, ActivitypubFollow, ActivitypubProfile, ActivitypubComment,
ActivitypubRetraction, ActivitypubShare, ActivitypubImage)
from federation.entities.base import Profile, Image
from federation.entities.base import Profile
from federation.entities.diaspora.entities import (
DiasporaPost, DiasporaComment, DiasporaLike, DiasporaProfile, DiasporaRetraction,
DiasporaContact, DiasporaReshare,
@ -144,20 +144,6 @@ https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a902
)
@pytest.fixture
def activitypubpost_linkified_tags():
with freeze_time("2019-04-27"):
return ActivitypubPost(
raw_content='<p>👁foobar</p><p>barfoo!<br>[#fanart](https://mastodon.art/tags/fanart) '
'[#mastoart](https://mastodon.art/tags/mastoart)</p>',
public=True,
provider_display_name="Mastodon",
id=f"http://127.0.0.1:8000/post/123456/",
activity_id=f"http://127.0.0.1:8000/post/123456/#create",
actor_id=f"http://127.0.0.1:8000/profile/123456/",
)
@pytest.fixture
def activitypubprofile():
return ActivitypubProfile(

Wyświetl plik

@ -34,12 +34,13 @@ class TestFindTags:
assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n"
def test_endings_are_filtered_out(self):
source = "#parenthesis) #exp! #list]"
source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar"
tags, text = find_tags(source)
assert tags == {"parenthesis", "exp", "list"}
assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \
"#foo/foo/#bar/bar"
def test_finds_tags(self):
source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
@ -49,6 +50,14 @@ class TestFindTags:
tags, text = find_tags(source, replacer=self._replacer)
assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
def test_ok_with_html_tags_in_text(self):
source = "<p>#starting and <span>#MixED</span> however not <#>this</#> or <#/>that"
tags, text = find_tags(source)
assert tags == {"starting", "mixed"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "<p>#starting/starting and <span>#MixED/mixed</span> however not <#>this</#> or <#/>that"
def test_postfixed_tags(self):
source = "#foo) #bar] #hoo, #hee."
tags, text = find_tags(source)
@ -66,7 +75,7 @@ class TestFindTags:
assert text == "(#foo/foo [#bar/bar"
def test_invalid_text_returns_no_tags(self):
source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a(a #a)a #a=a " \
"#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #aa #a”a #\xa0cd"
tags, text = find_tags(source)
assert tags == set()
@ -74,6 +83,14 @@ class TestFindTags:
tags, text = find_tags(source, replacer=self._replacer)
assert text == source
def test_start_of_paragraph_in_html_content(self):
source = '<p>First line</p><p>#foobar #barfoo</p>'
tags, text = find_tags(source)
assert tags == {"foobar", "barfoo"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == '<p>First line</p><p>#foobar/foobar #barfoo/barfoo</p>'
class TestProcessTextLinks:
def test_link_at_start_or_end(self):
@ -97,6 +114,12 @@ class TestProcessTextLinks:
assert process_text_links('<a href="/streams/tag/foobar">#foobar</a>') == \
'<a href="/streams/tag/foobar">#foobar</a>'
def test_does_not_remove_mention_classes(self):
assert process_text_links('<p><span class="h-card"><a href="https://dev.jasonrobinson.me/u/jaywink/" '
'class="u-url mention">@<span>jaywink</span></a></span> boom</p>') == \
'<p><span class="h-card"><a class="u-url mention" href="https://dev.jasonrobinson.me/u/jaywink/" ' \
'rel="nofollow" target="_blank">@<span>jaywink</span></a></span> boom</p>'
def test_validate_handle():
assert validate_handle("foo@bar.com")

Wyświetl plik

@ -33,7 +33,9 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
Returns a set of tags and the original or replaced text.
"""
found_tags = set()
lines = text.splitlines(keepends=True)
# <br> and <p> tags cause issues in us finding words - add some spacing around them
new_text = text.replace("<br>", " <br> ").replace("<p>", " <p> ").replace("</p>", " </p> ")
lines = new_text.splitlines(keepends=True)
final_lines = []
code_block = False
final_text = None
@ -49,17 +51,28 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
# Check each word separately
words = line.split(" ")
for word in words:
candidate = word.strip().strip("([]),.!?:")
if candidate.startswith("#"):
candidate = candidate.strip("#")
if test_tag(candidate.lower()):
found_tags.add(candidate.lower())
if replacer:
try:
tag_word = word.replace("#%s" % candidate, replacer(candidate))
final_words.append(tag_word)
except Exception:
final_words.append(word)
if word.find('#') > -1:
candidate = word.strip().strip("([]),.!?:*_%/")
if candidate.find('<') > -1 or candidate.find('>') > -1:
# Strip html
candidate = bleach.clean(word, strip=True)
# Now split with slashes
candidates = candidate.split("/")
to_replace = []
for candidate in candidates:
if candidate.startswith("#"):
candidate = candidate.strip("#")
if test_tag(candidate.lower()):
found_tags.add(candidate.lower())
to_replace.append(candidate)
if replacer:
tag_word = word
try:
for counter, replacee in enumerate(to_replace, 1):
tag_word = tag_word.replace("#%s" % replacee, replacer(replacee))
except Exception:
pass
final_words.append(tag_word)
else:
final_words.append(word)
else:
@ -67,6 +80,8 @@ def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
final_lines.append(" ".join(final_words))
if replacer:
final_text = "".join(final_lines)
if final_text:
final_text = final_text.replace(" <br> ", "<br>").replace(" <p> ", "<p>").replace(" </p> ", "</p>")
return found_tags, final_text or text

Wyświetl plik

@ -36,7 +36,6 @@ setup(
"lxml>=3.4.0",
"ipdata>=3.0",
"iteration_utilities",
"markdownify",
"jsonschema>=2.0.0",
"pycryptodome>=3.4.10",
"python-dateutil>=2.4.0",