diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 269c734..862b6cd 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -856,7 +856,7 @@ class Note(Object, RawContentMixin): parsed = urlparse(unquote(link['href']).lower()) # remove the query part and trailing garbage, if any path = parsed.path - trunc = re.match(r'(/[\w/]+)', parsed.path) + trunc = re.match(r'(/[\w/\-]+)', parsed.path) if trunc: path = trunc.group() url = f'{parsed.scheme}://{parsed.netloc}{path}' @@ -865,8 +865,9 @@ class Note(Object, RawContentMixin): normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}' links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url} if links.intersection(hrefs): - tag = re.match(r'#?([\w]+)', link.text).group(1).lower() - link['data-hashtag'] = tag + tag = re.match(r'^#?([\w\-]+$)', link.text) + if tag: + link['data-hashtag'] = tag.group(1).lower() def _find_and_mark_mentions(self): mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] diff --git a/federation/utils/text.py b/federation/utils/text.py index f66f437..7d728dd 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -7,7 +7,7 @@ from bs4.element import NavigableString from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" -TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) +TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE) @@ -56,7 +56,8 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] if ns: candidate.replace_with(*ns) - found.extend([child for child in parent.find_all(string=pattern) if child in ns]) + found.extend([child for child in parent.find_all( + string=re.compile(r'\A'+pattern.pattern+r'\Z')) if child in ns]) return found