add status length counter

2025-08-16 20:51:25 +03:00 · 2025-08-16 20:51:25 +03:00 · 2174694464
commit 2174694464
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@ -16,6 +16,7 @@ v2.1.0 (IN PROGRESS)
 * Added pagination iteraton via `pagination_iterator` (Thanks @FredericoCeratto for the suggestion)
 * Added a way to get pagination info out of lists that is slightly less digging-around-in-internals via `get_pagination_info` (Thanks @s427 for the inciting report)
 * Added missing `replies_policy` and `exclusive` parameters to list creation and update methods.
+* Added status length counter `get_status_length` (Thanks @yuletide for the suggestion)

 v2.0.1
 ------
--- a/docs/12_utilities.rst
+++ b/docs/12_utilities.rst
@ -33,3 +33,4 @@ Cache control
 Other utilities
 ---------------
 .. automethod:: Mastodon.get_approx_server_time
+.. automethod:: Mastodon.get_status_length
--- a/mastodon/_url_regex.py
+++ b/mastodon/_url_regex.py
--- a/mastodon/compat.py
+++ b/mastodon/compat.py
@ -45,3 +45,9 @@ except:
    class Path:
        pass

+IMPL_HAS_GRAPHEME = True  # set to False below when the optional grapheme package cannot be imported
+try:
+    import grapheme
+except Exception:  # best-effort optional import, but not a bare except: KeyboardInterrupt/SystemExit must propagate
+    IMPL_HAS_GRAPHEME = False
+    grapheme = None  # placeholder so that `from mastodon.compat import grapheme` still succeeds
--- a/mastodon/utility.py
+++ b/mastodon/utility.py
@ -7,7 +7,7 @@ import copy
 import warnings

 from mastodon.errors import MastodonAPIError, MastodonIllegalArgumentError, MastodonNotFoundError, MastodonVersionError
-from mastodon.compat import IMPL_HAS_BLURHASH, blurhash
+from mastodon.compat import IMPL_HAS_BLURHASH, blurhash, IMPL_HAS_GRAPHEME, grapheme
 from mastodon.internals import Mastodon as Internals

 from mastodon.versions import parse_version_string, max_version, api_version
@ -16,8 +16,8 @@ from typing import Optional, Union, Dict, Iterator
 from mastodon.return_types import PaginatableList, PaginationInfo, PaginatableList
 from mastodon.types_base import Entity, try_cast

-# Class level:
-
+from ._url_regex import url_regex
+import unicodedata

 class Mastodon(Internals):
    def set_language(self, lang):
@ -320,3 +320,30 @@ class Mastodon(Internals):
                current_page = self.fetch_next(current_page)
            else:
                current_page = self.fetch_previous(current_page)
+
+    @staticmethod
+    def get_status_length(text: str, spoiler_text: str = "") -> int:
+        """
+        For a given status `text` and `spoiler_text`, return how many characters this status counts as
+        when computing the status length and comparing it against the limit.
+
+        Note that there are other limits you may run into, such as the maximum length of a URL, or the
+        maximum length of a username's domain part. But as long as you do *normal* things, this function
+        will return the correct length for the status text.
+        """
+        if not IMPL_HAS_GRAPHEME:  # flag imported from mastodon.compat; False when `import grapheme` failed there
+            raise NotImplementedError(
+                'To use the get_status_length function, please install the grapheme Python module.')
+        
+        username_regex = re.compile(r'(^|[^/\w])@(([a-z0-9_]+)@[a-z0-9\.\-]+[a-z0-9]+)', re.IGNORECASE)  # group 1: char before the mention (or start of text), group 2: user@domain, group 3: user
+
+        def countable_text(input_text: str) -> str:
+            # Rewrite the text so that its grapheme count is the length that is counted against the limit:
+            # every url_regex match becomes its group 2 plus 23 "x" placeholders, remote mentions lose their @domain part
+            def _url_repl(m: re.Match) -> str:
+                return m.group(2) + ("x" * 23)  # group 2 is kept verbatim; presumably the text just before the URL - confirm in _url_regex.py
+            text = url_regex.sub(_url_repl, input_text)
+            text = username_regex.sub(r'\1@\3', text)  # "<prefix>@user@domain" -> "<prefix>@user"
+            return text
+
+        return grapheme.length(countable_text(text)) + grapheme.length(spoiler_text)  # spoiler_text is counted as-is, without the URL/mention rewriting
--- a/tests/test_status_length.py
+++ b/tests/test_status_length.py
@ -0,0 +1,58 @@
+
+import pytest
+
+from  mastodon import Mastodon
+
+TEST_CASES = [  # (status text, expected get_status_length(text) result) pairs
+    # Simple
+    ("", 0),
+    ("hello", 5),
+    (" leading and trailing spaces   ", 31),
+    (" tabs\tand\nnewlines\r\n", 19),
+
+    # URLs - schemes, TLDs, IPv4/IPv6, ports, creds (expected values written as 1000 - N; presumably N is a remaining-characters readout against a 1000-character limit - TODO confirm)
+    ("check http://example.com and https://example.org/page?x=1#frag", 1000 - 943),
+    ("ftp://files.example.net/resource", 1000 - 968),
+    ("http://user:pass@example.com:8080/path", 1000 - 962),
+    ("http://127.0.0.1:3000/health", 1000 - 972),
+    ("https://[2001:db8::1]/status", 1000 - 972),
+    ("https://[2001:db8:85a3::8a2e:370:7334]:443/path?ok=1", 1000 - 948),
+    ("mailto:someone@example.com", 1000 - 974),
+    ("git+ssh://git@example.co.uk:22/repo.git", 1000 - 961),
+    ("https://very.long.tld.example.museum/collection/item", 1000 - 977),
+
+    # Usernames - local and remote
+    ("@alice", 6),
+    ("@bob@example.com", 4),
+    ("hi @charlie and @dora@example.social!", 1000 -978),
+
+    # Mixed
+    ("hey @me@example.com look at https://example.com/a-b_c~d?e=f#g and @you  ", 50),
+
+    # Grapheme cluster vs code point differences
+    ("a: 🇪🇪", 4),
+    ("b: 👨‍👩‍👧‍👦", 4),
+    ("c: 👩🏽‍💻", 4),
+    ("d: ✊🏿", 4),
+    ("é", 1),
+    ("f\u0301", 1),
+
+    # Stress-tests
+    ("https://sub.sub2.пример.рф/путь/страница?параметр=значение#якорь", 47),
+    ("clusters: 😀😃😄😁😆😅😂🤣😊🙂😉🙃😇🥰😍🤩😘😗😙😚", 30),
+
+    # Varied compositions
+    ("See: http://example.com https://[2001:db8::2]:8443/a ftp://user:pw@files.example.org:21/x http://192.168.0.1/", 1000 - 886),
+    ("@one https://example.social/@two 👩🏽‍💻 🇪🇪 @three@example.com ✊🏿", 1000 - 959),
+
+    # Edge punctuation around URLs/usernames
+    ("(see https://example.com.)", 30),
+    ("[link: http://user:pass@host.example:8080/path?x=y#z]", 1000 - 947),
+    ("<@root> and {@admin@example.net}", 20),
+    ("https://example.com/a-b_c~d?param_a=1&param-b=2", 1000 - 977),
+]
+
+@pytest.mark.parametrize("text,expected", TEST_CASES)
+def test_get_status_length_against_ground_truth(text, expected):
+    assert Mastodon.get_status_length(text) == expected  # status text alone, no spoiler text
+    assert Mastodon.get_status_length(text, "what") == expected + 4  # the 4-character spoiler text "what" must add exactly 4
--- a/tox.ini
+++ b/tox.ini
@ -4,5 +4,5 @@ skipsdist = true


 [testenv]
-deps = .[test,webpush,blurhash]
+deps = .[test,webpush,blurhash,grapheme]
 commands = python setup.py test