From 4b5a886492f6bef5c7b3f07d27df76530c240280 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Mon, 17 Jul 2023 11:36:24 -0400
Subject: [PATCH] Match links with no http prefix. Remove trailing garbage from tags.

---
 federation/entities/activitypub/models.py | 8 +++++---
 federation/utils/text.py                  | 4 ++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index e7afbbe..8989440 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -849,11 +849,13 @@ class Note(Object, RawContentMixin):
 
         for link in self._soup.find_all('a', href=True):
             parsed = urlparse(link['href'].lower())
-            # remove the query part, if any
-            url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}'
+            # remove the query part and trailing garbage, if any
+            path = re.match(r'(/[\w/]+)', parsed.path).group()
+            url = f'{parsed.scheme}://{parsed.netloc}{path}'
             links = {link['href'].lower(), url}
             if links.intersection(hrefs):
-                link['data-hashtag'] = link.text.lstrip('#').lower()
+                tag = re.match(r'#?([\w]+)', link.text).group(1).lower()
+                link['data-hashtag'] = tag
 
     def _find_and_mark_mentions(self):
         mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 8ce6478..9d62c04 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -10,7 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
 TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
 # This will match non matching braces. I don't think it's an issue.
 MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
-URL_PATTERN = re.compile(r'(https?://[\w_\-.#?&/]+)', re.UNICODE)
+URL_PATTERN = re.compile(r'((?:https?://)?[\w_\-.#?&/~@!$()*,;%=+]+)', re.UNICODE)
 
 def decode_if_bytes(text):
     try:
@@ -52,7 +52,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
     for candidate in soup.find_all(string=True):
         if candidate.parent.name == 'code': continue
         ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
-        candidate.replace_with(*ns)
+        if ns: candidate.replace_with(*ns)
     return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z')))
 
 