From 64044e745255443fd32843e33114959b26d82226 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sat, 3 Feb 2024 09:17:07 -0500 Subject: [PATCH 01/13] when some Mention objects hrefs can't be found, try with the name property --- federation/entities/activitypub/models.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 176ba97..57408c3 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -885,11 +885,22 @@ class Note(Object, RawContentMixin): profile = retrieve_and_parse_profile(profile.id) if profile: hrefs.extend([profile.id, profile.url]) + else: + continue for href in hrefs: links = self._soup.find_all(href=href) for link in links: link['data-mention'] = profile.finger self._mentions.add(profile.finger) + if profile.finger not in self._mentions: + # can't find some mentions using their href property value + # try with the name property + matches = self._soup.find_all(string=mention.name) + for match in matches: + link = match.find_parent('a') + if link: + link['data-mention'] = profile.finger + self._mentions.add(profile.finger) def extract_mentions(self): """ From 4868291747584fc95704503bda628685691b4b42 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Fri, 9 Feb 2024 15:41:26 -0500 Subject: [PATCH 02/13] Ignore media objects that don't define a media type. 
--- federation/entities/activitypub/models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 57408c3..11e7798 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -421,6 +421,8 @@ class Document(Object): url = MixedField(as2.url, nested='LinkSchema') def to_base(self): + if self.media_type is missing: + return self self.__dict__.update({'schema': True}) if self.media_type.startswith('image'): return Image(**get_base_attributes(self)) From b190626bb9d0a52d7b3e71e47e4ead9989cd1d6e Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Fri, 9 Feb 2024 18:09:49 -0500 Subject: [PATCH 03/13] fix image duplication caused by both an img tag and a Image object for the same image are defined in a payload. --- federation/entities/activitypub/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 11e7798..082f5e7 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -966,7 +966,7 @@ class Note(Object, RawContentMixin): if hasattr(child, 'to_base'): child = child.to_base() if isinstance(child, Image): - if child.inline or (child.image and child.image in self.raw_content): + if child.inline or self._soup.find('img', src=child.url): continue children.append(child) self._cached_children = children From 3f98f1e04e6a41ba670b135a1bb720878a8a2eb4 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 11 Feb 2024 08:33:32 -0500 Subject: [PATCH 04/13] Select the img tag title property over the alt property for embedded images. Adjust the corresponding test. 
--- federation/entities/mixins.py | 18 +++++++++++------- federation/tests/fixtures/entities.py | 9 ++++++--- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index d37fd93..8855011 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -224,13 +224,17 @@ class RawContentMixin(BaseEntity): Returns a Tuple of (url, filename). """ images = [] - if self._media_type != "text/markdown" or self.raw_content is None: - return images - regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)" - matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE) - for match in matches: - groups = match.groups() - images.append((groups[1], groups[0] or "")) + if hasattr(self, '_soup'): + for img in self._soup.find_all('img', src=re.compile(r'^http')): + images.append((img['src'], img.get('title', '') or img.get('alt', ''))) + else: + if self._media_type != "text/markdown" or self.raw_content is None: + return images + regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)" + matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE) + for match in matches: + groups = match.groups() + images.append((groups[1], groups[0] or "")) return images # Legacy. 
Keep this until tests are reworked diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py index e555a97..7f48b43 100644 --- a/federation/tests/fixtures/entities.py +++ b/federation/tests/fixtures/entities.py @@ -1,5 +1,6 @@ import pytest # noinspection PyPackageRequirements +from commonmark import commonmark from freezegun import freeze_time from unittest.mock import patch @@ -152,8 +153,7 @@ def activitypubpost_tags(): @pytest.fixture def activitypubpost_embedded_images(): with freeze_time("2019-04-27"): - obj = models.Post( - raw_content=""" + raw_content=""" #Cycling #lauttasaari #sea #sun @@ -166,7 +166,10 @@ def activitypubpost_embedded_images(): [foo](https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414710.jpg) #only a link, not embedded https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414711.jpg -""", +""" + obj = models.Post( + raw_content=raw_content, + rendered_content=commonmark(raw_content, ignore_html_blocks=True), public=True, provider_display_name="Socialhome", id=f"http://127.0.0.1:8000/post/123456/", From 47bf0f579d4d5785042611f6ef89d6870f57ee90 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 11 Feb 2024 10:23:52 -0500 Subject: [PATCH 05/13] Nested fields: handle unknown json-ld types more gracefully. 
--- federation/entities/activitypub/models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 082f5e7..cb95320 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -209,7 +209,11 @@ class MixedField(fields.Nested): ret = [] for item in value: if item.get('@type'): - res = super()._deserialize(item, attr, data, **kwargs) + try: + res = super()._deserialize(item, attr, data, **kwargs) + except KeyError as ex: + logger.warning("nested field: undefined JSON-LD type %s", ex) + continue ret.append(res if not isinstance(res, list) else res[0]) else: ret.append(self.iri._deserialize(item, attr, data, **kwargs)) @@ -247,7 +251,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): icon = MixedField(as2.icon, nested='ImageSchema') image = MixedField(as2.image, nested='ImageSchema') tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True) - attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], + attachment = MixedField(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], many=True, default=[]) content_map = LanguageMap(as2.content) # language maps are not implemented in calamus context = fields.RawJsonLD(as2.context) From f1bb3544fa4a6d578715101283e89a541a6b20d7 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 11 Feb 2024 11:26:16 -0500 Subject: [PATCH 06/13] Improve webfinger handling of AP application type. 
--- federation/utils/activitypub.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/federation/utils/activitypub.py b/federation/utils/activitypub.py index 62404cc..5eb25df 100644 --- a/federation/utils/activitypub.py +++ b/federation/utils/activitypub.py @@ -1,5 +1,6 @@ import json import logging +import re from typing import Optional, Any from federation.protocols.activitypub.signing import get_http_authentication @@ -15,6 +16,7 @@ except Exception as exc: federation_user = None logger.warning("django is required for get requests signing: %s", exc) +type_path = re.compile(r'^application/(activity|ld)\+json') def get_profile_id_from_webfinger(handle: str) -> Optional[str]: """ @@ -29,7 +31,7 @@ def get_profile_id_from_webfinger(handle: str) -> Optional[str]: except json.JSONDecodeError: return for link in doc.get("links", []): - if link.get("rel") == "self" and link.get("type") == "application/activity+json": + if link.get("rel") == "self" and type_path.match(link.get("type")): return link["href"] logger.debug("get_profile_id_from_webfinger: found webfinger but it has no as2 self href") From 2509692041c5134671d28644bc38201ebf73aa02 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Mon, 12 Feb 2024 08:53:10 -0500 Subject: [PATCH 07/13] Mark an AP mention only if profile.finger is defined. 
--- federation/entities/activitypub/models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index cb95320..aac1819 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -885,11 +885,11 @@ class Note(Object, RawContentMixin): for mention in mentions: hrefs = [] profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href) - if profile and not profile.url: - # This should be removed when we are confident that the remote_url property - # has been populated for most profiles on the client app side. + if profile and not (profile.url and profile.finger): + # This should be removed when we are confident that the remote_url and + # finger properties have been populated for most profiles on the client app side. profile = retrieve_and_parse_profile(profile.id) - if profile: + if profile and profile.finger: hrefs.extend([profile.id, profile.url]) else: continue From d080bcf5093868068f0922f0df9341b85c9c96d8 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 13 Feb 2024 14:04:15 -0500 Subject: [PATCH 08/13] handle escaped characters in markdown mentions --- federation/entities/mixins.py | 4 ++++ setup.py | 1 + 2 files changed, 5 insertions(+) diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 8855011..70a68bd 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -6,6 +6,7 @@ from typing import List, Set, Union, Dict, Tuple from bs4 import BeautifulSoup from commonmark import commonmark +from markdownify import markdownify as md from marshmallow import missing from federation.entities.activitypub.enums import ActivityType @@ -262,6 +263,9 @@ class RawContentMixin(BaseEntity): if handle: self._mentions.add(handle) self.raw_content = self.raw_content.replace(mention, '@' + handle) + # mardownify the extracted mention in case some characters 
are escaped in + # raw_content + self.raw_content = self.raw_content.replace(md(mention), '@' + handle) class OptionalRawContentMixin(RawContentMixin): diff --git a/setup.py b/setup.py index a28fc6f..e2fae23 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ setup( "lxml>=3.4.0", "iteration_utilities", "jsonschema>=2.0.0", + "markdownify", "pycryptodome>=3.4.10", "python-dateutil>=2.4.0", "python-httpsig-socialhome", From 21184c368a8a50e4bf1cc9ad25075a3ee9c15f66 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 13 Feb 2024 16:57:56 -0500 Subject: [PATCH 09/13] Increase the Unicode character range allowed in Diaspora mentions. --- federation/utils/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/federation/utils/text.py b/federation/utils/text.py index bf1f235..063b5ad 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -8,8 +8,8 @@ from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE) -# This will match non matching braces. I don't think it's an issue. -MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) +# This will match non-matching braces. I don't think it's an issue. +MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u0250-\U0001f64f]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) # based on https://stackoverflow.com/a/6041965 URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))' r'[\w.,;:@?!$()*^=%&/~+\-#]*(?"]))', From 2672eede3972769bad22f28b9be878fc68230c17 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 13 Feb 2024 17:56:18 -0500 Subject: [PATCH 10/13] Also remove trailing garbage from hashtag text found in the rendered content. 
--- federation/entities/activitypub/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index aac1819..08c727f 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -872,7 +872,7 @@ class Note(Object, RawContentMixin): normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}' links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url} if links.intersection(hrefs): - tag = re.match(r'^#?([\w\-]+$)', link.text) + tag = re.match(r'^#?([\w\-]+)', link.text) if tag: link['data-hashtag'] = tag.group(1).lower() From 345a0c0ac303bbf5eb4a2742b08893053f53ebc9 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Thu, 15 Feb 2024 10:32:33 -0500 Subject: [PATCH 11/13] Handle cases where a nested AP field contains only one undefined object. --- federation/entities/activitypub/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 08c727f..ac48cc3 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -218,6 +218,7 @@ class MixedField(fields.Nested): else: ret.append(self.iri._deserialize(item, attr, data, **kwargs)) + if not ret: ret.append(None) return ret if len(ret) > 1 or self.many else ret[0] From e992e2dc20aa292c3c2aa1e8520f1d2092030c20 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 20 Feb 2024 10:15:49 -0500 Subject: [PATCH 12/13] Update CHANGELOG. --- CHANGELOG.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 23a514a..2ec05ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,41 @@ # Changelog +## Unreleased + +### Changed + +* This is actually both a change and a fix. 
AP Image objects do not define properties matching the + HTML img tag alt and title properties. Image.name is used to render both alt and title, which IMHO is + wrong. With this change, markdown images defining the title property will be recognized instead of + being thrown away (the fix) and the title property, if defined, will have precedence over the + alt property as the Image.name value (the change). Before this change, the client app would properly + render the img tag from the markdown source (with distinct alt and title properties), but the Image + object would not federate and hence not be displayed on other platforms (namely Mastodon). + +### Fixed + +* Note._find_and_mark_mentions: When an AP Mention object href can't be found in the rendered content, + try the name property. + +* Ignore media objects that don't define a media type. + +* Prevent rendered content image duplication when an image is both in the AP payload rendered content + and defined as an attachment that doesn't set the inlineImage property. + +* Instead of discarding the whole AP payload when encountering an undefined or unlisted AP object, + log a warning and keep going. Ensure None is returned when a nested field only contains an undefined + object. + +* Accept the application/ld+json type for webfinger AP links. + +* Mark an AP mention only if profile.finger is defined. + +* Handle escape sequences for inbound markdown mentions. + +* Extend the Unicode character range allowed in markdown mentions. + +* Discard illegal characters from tag text. Previously, this was done only on tag links. + ## [0.25.1] - 2024-02-18 ### Fixed From 3dfe7d637b0579c482a3b125fccb61923afdd721 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Mon, 26 Feb 2024 20:04:12 -0500 Subject: [PATCH 13/13] use markdownify instead of as md. 
--- federation/entities/mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 70a68bd..32c6cdd 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -6,7 +6,7 @@ from typing import List, Set, Union, Dict, Tuple from bs4 import BeautifulSoup from commonmark import commonmark -from markdownify import markdownify as md +from markdownify import markdownify from marshmallow import missing from federation.entities.activitypub.enums import ActivityType @@ -265,7 +265,7 @@ class RawContentMixin(BaseEntity): self.raw_content = self.raw_content.replace(mention, '@' + handle) # mardownify the extracted mention in case some characters are escaped in # raw_content - self.raw_content = self.raw_content.replace(md(mention), '@' + handle) + self.raw_content = self.raw_content.replace(markdownify(mention), '@' + handle) class OptionalRawContentMixin(RawContentMixin):