From d577e39777e39801ce9783ddbb0254e6e45ac263 Mon Sep 17 00:00:00 2001
From: Alain St-Denis <alain@zenfolie.org>
Date: Thu, 13 Jul 2023 11:09:00 -0400
Subject: [PATCH] Do not assume that the last part of a mention.href is the
 user's name. Adjust patterns to match leading whitespace or start of string.

---
 federation/entities/activitypub/models.py | 10 +++++++---
 federation/utils/text.py                  |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 5bdcff7..f9df860 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -857,9 +857,13 @@ class Note(Object, RawContentMixin):
 
     def _find_and_mark_mentions(self):
         mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
-        hrefs = [mention.href for mention in mentions]
-        # add Mastodon's form
-        hrefs.extend([re.sub(r'/(users/)([\w]+)$', r'/@\2', href) for href in hrefs])
+        hrefs = []
+        for mention in mentions:
+            hrefs.append(mention.href)
+            # add Mastodon's form
+            parsed = urlparse(mention.href)
+            username = mention.name.lstrip('@').split('@')[0]
+            hrefs.append(f'{parsed.scheme}://{parsed.netloc}/@{username}')
         for href in hrefs:
             links = self._soup.find_all(href=href)
             for link in links:
diff --git a/federation/utils/text.py b/federation/utils/text.py
index e2cd78c..3291fe8 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -9,8 +9,8 @@ from bs4.element import NavigableString
 from commonmark import commonmark
 
 ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
-TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
-MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
+TAG_PATTERN = re.compile(r'(^|\s)(#[\w]+)', re.UNICODE)
+MENTION_PATTERN = re.compile(r'(^|\s)(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
 
 
 def decode_if_bytes(text):