From 5267e4108cf253f59e71e50c1ac867cf7a34c8d8 Mon Sep 17 00:00:00 2001
From: Henri Dickson <90480431+alphatownsman@users.noreply.github.com>
Date: Sun, 19 Nov 2023 11:58:20 -0500
Subject: [PATCH] Allow unicode characters in hashtag (#659)

---
 core/html.py            |  2 +-
 tests/core/test_html.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/core/html.py b/core/html.py
index 0e0cf60..6f8c5e2 100644
--- a/core/html.py
+++ b/core/html.py
@@ -38,7 +38,7 @@ class FediverseHtmlParser(HTMLParser):
         r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
     )
 
-    HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
+    HASHTAG_REGEX = re.compile(r"\B#([\w()]+\b)(?!;)")
 
     EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
 
diff --git a/tests/core/test_html.py b/tests/core/test_html.py
index d2ce52f..493a944 100644
--- a/tests/core/test_html.py
+++ b/tests/core/test_html.py
@@ -1,4 +1,5 @@
 import pytest
+from django.template.defaultfilters import linebreaks_filter
 
 from core.html import FediverseHtmlParser
 
@@ -101,6 +102,16 @@ def test_parser(identity):
     assert parser.plain_text == "@TeSt@ExamPle.com"
     assert parser.mentions == {"test@example.com"}
 
+    # Ensure hashtags are parsed and linkified in local posts
+    parser = FediverseHtmlParser(
+        linebreaks_filter("#tag1-x,#tag2 #标签。"), find_hashtags=True
+    )
+    assert (
+        parser.html
+        == '

-x,

'
+    )
+    assert parser.hashtags == {"tag1", "tag2", "标签"}
+
     # Ensure hashtags are linked, even through spans, but not within hrefs
     parser = FediverseHtmlParser(
         'something #hashtag #hashtagtwo',