Porównaj commity

...

16 Commity

Autor SHA1 Wiadomość Data
Alain St-Denis 1f15583aad Merge branch 'todos-and-issues' into 'master'
Fixes addressing various manually tracked content issues.

See merge request jaywink/federation!183
2024-02-29 00:58:03 +00:00
Alain St-Denis 3dfe7d637b use markdownify instead of as md. 2024-02-26 20:04:12 -05:00
Alain St-Denis e992e2dc20 Update CHANGELOG. 2024-02-20 10:15:49 -05:00
Alain St-Denis 2ee17e4aa6 Merge branch 'master' into todos-and-issues 2024-02-20 08:31:28 -05:00
Alain St-Denis 8a4863fcd3 Merge branch 'fix-CVE-2024-23832' into todos-and-issues 2024-02-16 11:50:42 -05:00
Alain St-Denis 345a0c0ac3 Handle cases where a nested AP field contains only one undefined object. 2024-02-15 10:32:33 -05:00
Alain St-Denis 2672eede39 Also remove trailing garbage from hashtag text found in the rendered content. 2024-02-13 17:56:18 -05:00
Alain St-Denis 21184c368a Increase the Unicode character range allowed in Diaspora mentions. 2024-02-13 16:57:56 -05:00
Alain St-Denis d080bcf509 handle escaped characters in markdown mentions 2024-02-13 14:04:15 -05:00
Alain St-Denis 2509692041 Mark an AP mention only if profile.finger is defined. 2024-02-12 08:53:10 -05:00
Alain St-Denis f1bb3544fa Improve webfinger handling of AP application type. 2024-02-11 11:26:16 -05:00
Alain St-Denis 47bf0f579d Nested fields: handle unknown json-ld types more gracefully. 2024-02-11 10:23:52 -05:00
Alain St-Denis 3f98f1e04e Select the img tag title property over the alt property for embedded images. Adjust the corresponding test. 2024-02-11 08:33:32 -05:00
Alain St-Denis b190626bb9 fix image duplication caused by both an img tag and a Image object for the same image are defined in a payload. 2024-02-09 18:09:49 -05:00
Alain St-Denis 4868291747 Ignore media objects that don't define a media type. 2024-02-09 15:41:26 -05:00
Alain St-Denis 64044e7452 when some Mention objects hrefs can't be found, try with the name property 2024-02-03 09:17:07 -05:00
7 zmienionych plików z 89 dodań i 21 usunięć

Wyświetl plik

@ -1,5 +1,41 @@
# Changelog
## Unreleased
### Changed
* This is actually both a change and a fix. AP Image objects do not define properties matching the
HTML img tag alt and title properties. Image.name is used to render both alt and title, which IMHO is
wrong. With this change, markdown images defining the title property will be recognized instead of
being thrown away (the fix) and the title property, if defined, will have precedence over the
alt property as the Image.name value (the change). Before this change, the client app would properly
render the img tag from the markdown source (with distinct alt and title properties), but the Image
object would not federate and hence not be displayed on other platforms (namely Mastodon).
### Fixed
* Note._find_and_mark_mentions: When an AP Mention object href can't be found in the rendered content,
try the name property.
* Ignore media objects that don't define a media type.
* Prevent rendered content image duplication when an image is both in the AP payload rendered content
and defined as an attachment that doesn't set the inlineImage property.
* Instead of discarding the whole AP payload when encountering an undefined or unlisted AP object,
log a warning and keep going. Ensure None is returned when a nested field only contains an undefined
object.
* Accept the application/ld+json type for webfinger AP links.
* Mark an AP mention only if profile.finger is defined.
* Handle escape sequences for inbound markdown mentions.
* Extend the Unicode character range allowed in markdown mentions.
* Discard illegal characters from tag text. Previously, this was done only on tag links.
## [0.25.1] - 2024-02-18
### Fixed

Wyświetl plik

@ -209,11 +209,16 @@ class MixedField(fields.Nested):
ret = []
for item in value:
if item.get('@type'):
res = super()._deserialize(item, attr, data, **kwargs)
try:
res = super()._deserialize(item, attr, data, **kwargs)
except KeyError as ex:
logger.warning("nested field: undefined JSON-LD type %s", ex)
continue
ret.append(res if not isinstance(res, list) else res[0])
else:
ret.append(self.iri._deserialize(item, attr, data, **kwargs))
if not ret: ret.append(None)
return ret if len(ret) > 1 or self.many else ret[0]
@ -247,7 +252,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
icon = MixedField(as2.icon, nested='ImageSchema')
image = MixedField(as2.image, nested='ImageSchema')
tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True)
attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
attachment = MixedField(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
many=True, default=[])
content_map = LanguageMap(as2.content) # language maps are not implemented in calamus
context = fields.RawJsonLD(as2.context)
@ -421,6 +426,8 @@ class Document(Object):
url = MixedField(as2.url, nested='LinkSchema')
def to_base(self):
if self.media_type is missing:
return self
self.__dict__.update({'schema': True})
if self.media_type.startswith('image'):
return Image(**get_base_attributes(self))
@ -866,7 +873,7 @@ class Note(Object, RawContentMixin):
normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}'
links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url}
if links.intersection(hrefs):
tag = re.match(r'^#?([\w\-]+$)', link.text)
tag = re.match(r'^#?([\w\-]+)', link.text)
if tag:
link['data-hashtag'] = tag.group(1).lower()
@ -879,17 +886,28 @@ class Note(Object, RawContentMixin):
for mention in mentions:
hrefs = []
profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href)
if profile and not profile.url:
# This should be removed when we are confident that the remote_url property
# has been populated for most profiles on the client app side.
if profile and not (profile.url and profile.finger):
# This should be removed when we are confident that the remote_url and
# finger properties have been populated for most profiles on the client app side.
profile = retrieve_and_parse_profile(profile.id)
if profile:
if profile and profile.finger:
hrefs.extend([profile.id, profile.url])
else:
continue
for href in hrefs:
links = self._soup.find_all(href=href)
for link in links:
link['data-mention'] = profile.finger
self._mentions.add(profile.finger)
if profile.finger not in self._mentions:
# can't find some mentions using their href property value
# try with the name property
matches = self._soup.find_all(string=mention.name)
for match in matches:
link = match.find_parent('a')
if link:
link['data-mention'] = profile.finger
self._mentions.add(profile.finger)
def extract_mentions(self):
"""
@ -953,7 +971,7 @@ class Note(Object, RawContentMixin):
if hasattr(child, 'to_base'):
child = child.to_base()
if isinstance(child, Image):
if child.inline or (child.image and child.image in self.raw_content):
if child.inline or self._soup.find('img', src=child.url):
continue
children.append(child)
self._cached_children = children

Wyświetl plik

@ -6,6 +6,7 @@ from typing import List, Set, Union, Dict, Tuple
from bs4 import BeautifulSoup
from commonmark import commonmark
from markdownify import markdownify
from marshmallow import missing
from federation.entities.activitypub.enums import ActivityType
@ -224,13 +225,17 @@ class RawContentMixin(BaseEntity):
Returns a Tuple of (url, filename).
"""
images = []
if self._media_type != "text/markdown" or self.raw_content is None:
return images
regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)"
matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE)
for match in matches:
groups = match.groups()
images.append((groups[1], groups[0] or ""))
if hasattr(self, '_soup'):
for img in self._soup.find_all('img', src=re.compile(r'^http')):
images.append((img['src'], img.get('title', '') or img.get('alt', '')))
else:
if self._media_type != "text/markdown" or self.raw_content is None:
return images
regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)"
matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE)
for match in matches:
groups = match.groups()
images.append((groups[1], groups[0] or ""))
return images
# Legacy. Keep this until tests are reworked
@ -258,6 +263,9 @@ class RawContentMixin(BaseEntity):
if handle:
self._mentions.add(handle)
self.raw_content = self.raw_content.replace(mention, '@' + handle)
# markdownify the extracted mention in case some characters are escaped in
# raw_content
self.raw_content = self.raw_content.replace(markdownify(mention), '@' + handle)
class OptionalRawContentMixin(RawContentMixin):

Wyświetl plik

@ -1,5 +1,6 @@
import pytest
# noinspection PyPackageRequirements
from commonmark import commonmark
from freezegun import freeze_time
from unittest.mock import patch
@ -152,8 +153,7 @@ def activitypubpost_tags():
@pytest.fixture
def activitypubpost_embedded_images():
with freeze_time("2019-04-27"):
obj = models.Post(
raw_content="""
raw_content="""
#Cycling #lauttasaari #sea #sun
@ -166,7 +166,10 @@ def activitypubpost_embedded_images():
[foo](https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414710.jpg)
#only a link, not embedded
https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414711.jpg
""",
"""
obj = models.Post(
raw_content=raw_content,
rendered_content=commonmark(raw_content, ignore_html_blocks=True),
public=True,
provider_display_name="Socialhome",
id=f"http://127.0.0.1:8000/post/123456/",

Wyświetl plik

@ -1,5 +1,6 @@
import json
import logging
import re
from typing import Optional, Any
from urllib.parse import urlparse
@ -16,6 +17,7 @@ except Exception as exc:
federation_user = None
logger.warning("django is required for get requests signing: %s", exc)
type_path = re.compile(r'^application/(activity|ld)\+json')
def get_profile_id_from_webfinger(handle: str) -> Optional[str]:
"""
@ -30,7 +32,7 @@ def get_profile_id_from_webfinger(handle: str) -> Optional[str]:
except json.JSONDecodeError:
return
for link in doc.get("links", []):
if link.get("rel") == "self" and link.get("type") == "application/activity+json":
if link.get("rel") == "self" and type_path.match(link.get("type")):
return link["href"]
logger.debug("get_profile_id_from_webfinger: found webfinger but it has no as2 self href")

Wyświetl plik

@ -8,8 +8,8 @@ from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
# This will match non-matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u0250-\U0001f64f]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
# based on https://stackoverflow.com/a/6041965
URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))'
r'[\w.,;:@?!$()*^=%&/~+\-#]*(?<![:;,).>"]))',

Wyświetl plik

@ -37,6 +37,7 @@ setup(
"lxml>=3.4.0",
"iteration_utilities",
"jsonschema>=2.0.0",
"markdownify",
"pycryptodome>=3.4.10",
"python-dateutil>=2.4.0",
"python-httpsig-socialhome",