From 9cb2509ab6cb75c4a8596a8bd7f5e70d48806aff Mon Sep 17 00:00:00 2001
From: Jason Robinson <mail@jasonrobinson.me>
Date: Mon, 17 Feb 2020 21:58:00 +0200
Subject: [PATCH] Add replacer functionality to the find_tags utility

---
 CHANGELOG.md                        |  3 +-
 federation/entities/mixins.py       |  2 +-
 federation/tests/utils/test_text.py | 80 ++++++++++++++++++++---------
 federation/utils/text.py            | 29 +++++++++--
 4 files changed, 86 insertions(+), 28 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1ee6bf8..e311c0f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,7 +11,8 @@
   
 * Add `process_text_links` text utility to linkify URL's in text.
 
-* Add `find_tags` text utility to find hashtags from text. This utility is used
+* Add `find_tags` text utility to find hashtags from text. Optionally, the function can
+  also replace the tags using a given `replacer` function. This utility is used
   to improve the tag extraction logic from entities text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70))
 
 ### Changed
diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py
index c7a6a1c..4a00ba4 100644
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@@ -230,7 +230,7 @@ class RawContentMixin(BaseEntity):
         """Returns a `list` of unique tags contained in `raw_content`."""
         if not self.raw_content:
             return []
-        tags = find_tags(self.raw_content)
+        tags, _text = find_tags(self.raw_content)
         return sorted(tags)
 
     def extract_mentions(self):
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 61c10fc..6f0c1b1 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -12,33 +12,67 @@ def test_encode_if_text():
 
 
 class TestFindTags:
-    def test_factory_instance_has_tags(self):
-        assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"}
-
-    def test_extract_tags_adds_new_tags(self):
-        assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == {
-            "third", "fourth", "post", "othertag", "tag",
-        }
+    @staticmethod
+    def _replacer(text):
+        return f"#{text}/{text.lower()}"
 
     def test_all_tags_are_parsed_from_text(self):
-        assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \
-            {"starting", "mixed", "line", "start"}
-
-    def test_invalid_text_returns_no_tags(self):
-        assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
-                         "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd") == set()
-
-    def test_endings_are_filtered_out(self):
-        assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"}
-
-    def test_prefixed_tags(self):
-        assert find_tags("(#foo [#bar") == {"foo", "bar"}
-
-    def test_postfixed_tags(self):
-        assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"}
+        source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
+        tags, text = find_tags(source)
+        assert tags == {"starting", "mixed", "line", "start"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \
+                       "#start/start on new line"
 
     def test_code_block_tags_ignored(self):
-        assert find_tags("foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n") == {"notcode"}
+        source = "foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n"
+        tags, text = find_tags(source)
+        assert tags == {"notcode"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n    #alsocode\n"
+
+    def test_endings_are_filtered_out(self):
+        source = "#parenthesis) #exp! #list]"
+        tags, text = find_tags(source)
+        assert tags == {"parenthesis", "exp", "list"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
+
+    def test_finds_tags(self):
+        source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
+        tags, text = find_tags(source)
+        assert tags == {"third", "fourth", "post", "othertag", "tag"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
+
+    def test_postfixed_tags(self):
+        source = "#foo) #bar] #hoo, #hee."
+        tags, text = find_tags(source)
+        assert tags == {"foo", "bar", "hoo", "hee"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."
+
+    def test_prefixed_tags(self):
+        source = "(#foo [#bar"
+        tags, text = find_tags(source)
+        assert tags == {"foo", "bar"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "(#foo/foo [#bar/bar"
+
+    def test_invalid_text_returns_no_tags(self):
+        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
+                 "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
+        tags, text = find_tags(source)
+        assert tags == set()
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == source
 
 
 class TestProcessTextLinks:
diff --git a/federation/utils/text.py b/federation/utils/text.py
index aff0699..8bfb02b 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -1,5 +1,5 @@
 import re
-from typing import Set
+from typing import Set, Tuple
 from urllib.parse import urlparse
 
 import bleach
@@ -22,19 +22,29 @@ def encode_if_text(text):
         return text
 
 
-def find_tags(text: str) -> Set:
+def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
     """Find tags in text.
 
     Tries to ignore tags inside code blocks.
+
+    Optionally, if passed a "replacer", will also replace the tag word with the result
+    of the replacer function called with the tag word.
+
+    Returns a set of tags and the original or replaced text.
     """
     found_tags = set()
     lines = text.splitlines(keepends=True)
+    final_lines = []
     code_block = False
+    final_text = None
     # Check each line separately
     for line in lines:
+        final_words = []
         if line[0:3] == "```":
             code_block = not code_block
         if line.find("#") == -1 or line[0:4] == "    " or code_block:
+            # Just add the whole line
+            final_lines.append(line)
             continue
         # Check each word separately
         words = line.split(" ")
@@ -44,7 +54,20 @@ def find_tags(text: str) -> Set:
                 candidate = candidate.strip("#")
                 if test_tag(candidate.lower()):
                     found_tags.add(candidate.lower())
-    return found_tags
+                    if replacer:
+                        try:
+                            tag_word = word.replace("#%s" % candidate, replacer(candidate))
+                            final_words.append(tag_word)
+                        except Exception:
+                            final_words.append(word)
+                else:
+                    final_words.append(word)
+            else:
+                final_words.append(word)
+        final_lines.append(" ".join(final_words))
+    if replacer:
+        final_text = "".join(final_lines)
+    return found_tags, final_text or text
 
 
 def get_path_from_url(url: str) -> str: