add status length counter

pull/419/head
halcy 2025-08-16 20:51:25 +03:00
rodzic 974844bacb
commit 2174694464
7 zmienionych plików z 103 dodań i 4 usunięć

Wyświetl plik

@ -16,6 +16,7 @@ v2.1.0 (IN PROGRESS)
* Added pagination iteration via `pagination_iterator` (Thanks @FredericoCeratto for the suggestion)
* Added a way to get pagination info out of lists that is slightly less digging-around-in-internals via `get_pagination_info` (Thanks @s427 for the inciting report)
* Added missing `replies_policy` and `exclusive` parameters to list creation and update methods.
* Added status length counter `get_status_length` (Thanks @yuletide for the suggestion)
v2.0.1
------

Wyświetl plik

@ -33,3 +33,4 @@ Cache control
Other utilities
---------------
.. automethod:: Mastodon.get_approx_server_time
.. automethod:: Mastodon.get_status_length

File diff suppressed because one or more lines are too long

Wyświetl plik

@ -45,3 +45,9 @@ except:
class Path:
pass
# grapheme is an optional dependency. IMPL_HAS_GRAPHEME records whether it
# could be imported, so callers can check the flag before using `grapheme`.
IMPL_HAS_GRAPHEME = True
try:
    import grapheme
except ImportError:
    # Only a missing/unimportable module means "feature unavailable"; a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    IMPL_HAS_GRAPHEME = False
    grapheme = None

Wyświetl plik

@ -7,7 +7,7 @@ import copy
import warnings
from mastodon.errors import MastodonAPIError, MastodonIllegalArgumentError, MastodonNotFoundError, MastodonVersionError
from mastodon.compat import IMPL_HAS_BLURHASH, blurhash
from mastodon.compat import IMPL_HAS_BLURHASH, blurhash, IMPL_HAS_GRAPHEME, grapheme
from mastodon.internals import Mastodon as Internals
from mastodon.versions import parse_version_string, max_version, api_version
@ -16,8 +16,8 @@ from typing import Optional, Union, Dict, Iterator
from mastodon.return_types import PaginatableList, PaginationInfo, PaginatableList
from mastodon.types_base import Entity, try_cast
# Class level:
from ._url_regex import url_regex
import unicodedata
class Mastodon(Internals):
def set_language(self, lang):
@ -320,3 +320,30 @@ class Mastodon(Internals):
current_page = self.fetch_next(current_page)
else:
current_page = self.fetch_previous(current_page)
@staticmethod
def get_status_length(text: str, spoiler_text: str = "") -> int:
    """
    For a given status `text` and `spoiler_text`, return how many characters this status counts as
    when computing the status length and comparing it against the limit.

    Before counting, every `url_regex` match in `text` is replaced by its second capture group
    followed by 23 placeholder characters, and every remote mention of the form `@user@domain`
    is shortened to `@user`. Lengths are measured with `grapheme.length`, i.e. in grapheme
    clusters rather than code points. `spoiler_text` is counted as-is, without any of these
    substitutions; an empty or `None` spoiler text adds nothing.

    Note that there are other limits you may run into, such as the maximum length of a URL, or the
    maximum length of a username's domain part. But as long as you do *normal* things, this function
    will return the correct length for the status text.

    Raises `NotImplementedError` if the optional `grapheme` module is not installed.
    """
    if not IMPL_HAS_GRAPHEME:
        raise NotImplementedError(
            'To use the get_status_length function, please install the grapheme Python module.')

    # Matches a remote mention; group 1 is the character before the "@", group 3 the local user name.
    username_regex = re.compile(r'(^|[^/\w])@(([a-z0-9_]+)@[a-z0-9\.\-]+[a-z0-9]+)', re.IGNORECASE)

    def countable_text(input_text: str) -> str:
        # Transform text such that it has the correct length for counting
        # post text lengths against the limit
        def _url_repl(m: re.Match) -> str:
            # A group that did not take part in the match is None in Python; treat it as
            # empty instead of failing on None + str.
            # NOTE(review): group 2 is presumably the text preceding the URL - confirm
            # against the url_regex definition.
            return (m.group(2) or "") + ("x" * 23)

        replaced = url_regex.sub(_url_repl, input_text)
        # Keep the leading character and "@user", drop the "@domain" part.
        return username_regex.sub(r'\1@\3', replaced)

    # `spoiler_text or ""` lets callers pass None for "no spoiler text" without a TypeError.
    return grapheme.length(countable_text(text)) + grapheme.length(spoiler_text or "")

Wyświetl plik

@ -0,0 +1,58 @@
import pytest
from mastodon import Mastodon
# Table of (status text, expected counted length) pairs used by the parametrized test below.
# NOTE(review): many expectations are spelled `1000 - N`; presumably N is the remaining-character
# count reported by a reference server with a 1000 character limit - confirm how the ground
# truth was collected before editing any value.
TEST_CASES = [
# Simple
("", 0),
("hello", 5),
(" leading and trailing spaces ", 31),
(" tabs\tand\nnewlines\r\n", 19),
# URLs - schemes, TLDs, IPv4/IPv6, ports, creds
("check http://example.com and https://example.org/page?x=1#frag", 1000 - 943),
("ftp://files.example.net/resource", 1000 - 968),
("http://user:pass@example.com:8080/path", 1000 - 962),
("http://127.0.0.1:3000/health", 1000 - 972),
("https://[2001:db8::1]/status", 1000 - 972),
("https://[2001:db8:85a3::8a2e:370:7334]:443/path?ok=1", 1000 - 948),
("mailto:someone@example.com", 1000 - 974),
("git+ssh://git@example.co.uk:22/repo.git", 1000 - 961),
("https://very.long.tld.example.museum/collection/item", 1000 - 977),
# Usernames - local and remote
("@alice", 6),
("@bob@example.com", 4),
("hi @charlie and @dora@example.social!", 1000 -978),
# Mixed
("hey @me@example.com look at https://example.com/a-b_c~d?e=f#g and @you ", 50),
# Grapheme cluster vs code point differences
("a: 🇪🇪", 4),
("b: 👨‍👩‍👧‍👦", 4),
("c: 👩🏽‍💻", 4),
("d: ✊🏿", 4),
("é", 1),
("f\u0301", 1),
# Stress-tests
("https://sub.sub2.пример.рф/путь/страница?параметр=значение#якорь", 47),
("clusters: 😀😃😄😁😆😅😂🤣😊🙂😉🙃😇🥰😍🤩😘😗😙😚", 30),
# Varied compositions
("See: http://example.com https://[2001:db8::2]:8443/a ftp://user:pw@files.example.org:21/x http://192.168.0.1/", 1000 - 886),
("@one https://example.social/@two 👩🏽‍💻 🇪🇪 @three@example.com ✊🏿", 1000 - 959),
# Edge punctuation around URLs/usernames
("(see https://example.com.)", 30),
("[link: http://user:pass@host.example:8080/path?x=y#z]", 1000 - 947),
("<@root> and {@admin@example.net}", 20),
("https://example.com/a-b_c~d?param_a=1&param-b=2", 1000 - 977),
]
@pytest.mark.parametrize("text,expected", TEST_CASES)
def test_get_status_length_against_ground_truth(text, expected):
    """Check the counted length of `text` on its own and with a four character spoiler text."""
    plain_length = Mastodon.get_status_length(text)
    assert plain_length == expected

    # The spoiler text "what" must add exactly its own four characters.
    length_with_spoiler = Mastodon.get_status_length(text, "what")
    assert length_with_spoiler == expected + 4

Wyświetl plik

@ -4,5 +4,5 @@ skipsdist = true
[testenv]
deps = .[test,webpush,blurhash]
deps = .[test,webpush,blurhash,grapheme]
commands = python setup.py test