Add language support to posts

pull/579/head
Christof Dorner 2023-05-15 12:08:11 +02:00
rodzic 32216315aa
commit 808838707a
7 zmienionych plików z 104 dodań i 3 usunięć

Wyświetl plik

@ -0,0 +1,18 @@
# Generated by Django 4.2.1 on 2023-05-15 09:26
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add a nullable two-character ``language`` column to the ``post`` model."""

    # Must run after the previous migration of the "activities" app.
    dependencies = [("activities", "0016_index_together_migration")]

    operations = [
        migrations.AddField(
            model_name="post",
            name="language",
            # null=True: rows without a recorded language store NULL.
            field=models.CharField(null=True, max_length=2),
        ),
    ]

Wyświetl plik

@ -31,6 +31,7 @@ from core.html import ContentRenderer, FediverseHtmlParser
from core.ld import (
canonicalise,
format_ld_date,
get_language,
get_list,
get_value_or_map,
parse_ld_date,
@ -252,6 +253,9 @@ class Post(StatorModel):
# The main (HTML) content
content = models.TextField()
# The language of the content
language = models.CharField(max_length=2, null=True)
type = models.CharField(
max_length=20,
choices=Types.choices,
@ -474,6 +478,7 @@ class Post(StatorModel):
reply_to: Optional["Post"] = None,
attachments: list | None = None,
question: dict | None = None,
language: str | None = None,
) -> "Post":
with transaction.atomic():
# Find mentions in this post
@ -492,6 +497,9 @@ class Post(StatorModel):
sorted([tag[: Hashtag.MAXIMUM_LENGTH] for tag in parser.hashtags])
or None
)
if language is None:
language = author.config_identity.preferred_posting_language
# Make the Post object
post = cls.objects.create(
author=author,
@ -502,6 +510,7 @@ class Post(StatorModel):
visibility=visibility,
hashtags=hashtags,
in_reply_to=reply_to.object_uri if reply_to else None,
language=language,
)
post.object_uri = post.urls.object_uri
post.url = post.absolute_object_uri()
@ -526,6 +535,7 @@ class Post(StatorModel):
visibility: int = Visibilities.public,
attachments: list | None = None,
attachment_attributes: list | None = None,
language: str | None = None,
):
with transaction.atomic():
# Strip all HTML and apply linebreaks filter
@ -538,6 +548,9 @@ class Post(StatorModel):
self.summary = summary or None
self.sensitive = bool(summary) if sensitive is None else sensitive
self.visibility = visibility
if language is None:
language = self.author.config_identity.preferred_posting_language
self.language = language
self.edited = timezone.now()
self.mentions.set(self.mentions_from_content(content, self.author))
self.emojis.set(Emoji.emojis_from_content(content, None))
@ -649,6 +662,10 @@ class Post(StatorModel):
"tag": [],
"attachment": [],
}
if self.language is not None:
value["contentMap"] = {
self.language: value["content"],
}
if self.type == Post.Types.question and self.type_data:
value[self.type_data.mode] = [
{
@ -872,6 +889,7 @@ class Post(StatorModel):
post.published = parse_ld_date(data.get("published"))
post.edited = parse_ld_date(data.get("updated"))
post.in_reply_to = data.get("inReplyTo")
post.language = get_language(data)
# Mentions and hashtags
post.hashtags = []
for tag in get_list(data, "tag"):
@ -1112,6 +1130,7 @@ class Post(StatorModel):
"created_at": format_ld_date(self.published),
"account": self.author.to_mastodon_json(include_counts=False),
"content": self.safe_content_remote(),
"language": self.language,
"visibility": visibility_mapping[self.visibility],
"sensitive": self.sensitive,
"spoiler_text": self.summary or "",
@ -1152,7 +1171,6 @@ class Post(StatorModel):
if isinstance(self.type_data, QuestionData)
else None,
"card": None,
"language": None,
"text": self.safe_content_remote(),
"edited_at": format_ld_date(self.edited) if self.edited else None,
}

Wyświetl plik

@ -151,7 +151,7 @@ class Status(Schema):
reblog: Optional["Status"] = Field(...)
poll: Poll | None = Field(...)
card: None = Field(...)
language: None = Field(...)
language: str | None = Field(...)
text: str | None = Field(...)
edited_at: str | None
favourited: bool = False

Wyświetl plik

@ -110,6 +110,7 @@ def post_status(request, details: PostStatusSchema) -> schemas.Status:
reply_to=reply_post,
attachments=attachments,
question=details.poll.dict() if details.poll else None,
language=details.language,
)
# Add their own timeline event for immediate visibility
TimelineEvent.add_post(request.identity, post)
@ -141,6 +142,7 @@ def edit_status(request, id: str, details: EditStatusSchema) -> schemas.Status:
sensitive=details.sensitive,
attachments=attachments,
attachment_attributes=details.media_attributes,
language=details.language,
)
return schemas.Status.from_post(post)

Wyświetl plik

@ -1,5 +1,6 @@
import datetime
import os
import re
import urllib.parse as urllib_parse
from dateutil import parser
@ -692,3 +693,24 @@ def media_type_from_filename(filename):
return "image/webp"
else:
return "application/octet-stream"
def get_language(data) -> str | None:
"""Detects and returns a document's language"""
map_ = None
if "contentMap" in data:
map_ = data["contentMap"]
elif "nameMap" in data:
map_ = data["nameMap"]
elif "summaryMap" in data:
map_ = data["summaryMap"]
if not map_:
return None
lang = list(map_.keys())[0]
if not lang or lang == "und":
return None
lang = re.split("-|_", lang)[0]
return lang.lower()

Wyświetl plik

@ -259,6 +259,7 @@ def test_content_map(remote_identity):
create=True,
)
assert post.content == "Hi World"
assert post.language is None
post2 = Post.by_ap(
data={
@ -271,6 +272,7 @@ def test_content_map(remote_identity):
create=True,
)
assert post2.content == "Hey World"
assert post2.language is None
post3 = Post.by_ap(
data={
@ -283,6 +285,7 @@ def test_content_map(remote_identity):
create=True,
)
assert post3.content == "Hello World"
assert post3.language == "en"
@pytest.mark.django_db

Wyświetl plik

@ -2,7 +2,7 @@ import datetime
from dateutil.tz import tzutc
from core.ld import parse_ld_date
from core.ld import get_language, parse_ld_date
def test_parse_ld_date():
@ -41,3 +41,41 @@ def test_parse_ld_date():
tzinfo=tzutc(),
)
assert difference.total_seconds() == 0
def test_get_language():
    """Check map precedence, tag normalisation and the ``None`` cases."""
    name_map = {"de": "Hallo"}
    summary_map = {"fr": "Bonjour"}

    # (document, expected language) pairs.
    resolved = [
        # contentMap wins over nameMap and summaryMap; its first key is used.
        (
            {
                "contentMap": {
                    "en": "<p>Hello</p>",
                    "es": "<p>hola</p>",
                },
                "nameMap": name_map,
                "summaryMap": summary_map,
            },
            "en",
        ),
        # Without contentMap, nameMap wins over summaryMap.
        ({"nameMap": name_map, "summaryMap": summary_map}, "de"),
        # summaryMap is the last resort.
        ({"summaryMap": summary_map}, "fr"),
        # Region subtags are dropped and the result is lower-cased.
        ({"contentMap": {"en-gb": "<p>Hello</p>"}}, "en"),
        ({"contentMap": {"en_GB": "<p>Hello</p>"}}, "en"),
        ({"contentMap": {"EN": "<p>Hello</p>"}}, "en"),
    ]
    for document, expected in resolved:
        assert get_language(document) == expected

    # Documents for which no language is reported.
    unresolved = [
        {"contentMap": {"und": "<p>Hello</p>"}},
        {},
    ]
    for document in unresolved:
        assert get_language(document) is None