Add replacer functionality to the find_tags utility

2020-02-17 21:58:00 +02:00 · 2020-02-17 21:58:00 +02:00 · 9cb2509ab6
commit 9cb2509ab6
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,7 +11,8 @@
  
 * Add `process_text_links` text utility to linkify URL's in text.

-* Add `find_tags` text utility to find hashtags from text. This utility is used
+* Add `find_tags` text utility to find hashtags from text. Optionally, the function can
+  also replace each tag in the text via a given `replacer` function. This utility is used
  to improve the tag extraction logic from entities text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70))

 ### Changed
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@ -230,7 +230,7 @@ class RawContentMixin(BaseEntity):
        """Returns a `list` of unique tags contained in `raw_content`."""
        if not self.raw_content:
            return []
-        tags = find_tags(self.raw_content)
+        tags, _text = find_tags(self.raw_content)
        return sorted(tags)

    def extract_mentions(self):
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@ -12,33 +12,67 @@ def test_encode_if_text():


 class TestFindTags:
-    def test_factory_instance_has_tags(self):
-        assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"}
-
-    def test_extract_tags_adds_new_tags(self):
-        assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == {
-            "third", "fourth", "post", "othertag", "tag",
-        }
+    @staticmethod
+    def _replacer(text):
+        return f"#{text}/{text.lower()}"

    def test_all_tags_are_parsed_from_text(self):
-        assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \
-            {"starting", "mixed", "line", "start"}
-
-    def test_invalid_text_returns_no_tags(self):
-        assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
-                         "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd") == set()
-
-    def test_endings_are_filtered_out(self):
-        assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"}
-
-    def test_prefixed_tags(self):
-        assert find_tags("(#foo [#bar") == {"foo", "bar"}
-
-    def test_postfixed_tags(self):
-        assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"}
+        source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
+        tags, text = find_tags(source)
+        assert tags == {"starting", "mixed", "line", "start"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \
+                       "#start/start on new line"

    def test_code_block_tags_ignored(self):
-        assert find_tags("foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n") == {"notcode"}
+        source = "foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n"
+        tags, text = find_tags(source)
+        assert tags == {"notcode"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n    #alsocode\n"
+
+    def test_endings_are_filtered_out(self):
+        source = "#parenthesis) #exp! #list]"
+        tags, text = find_tags(source)
+        assert tags == {"parenthesis", "exp", "list"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
+
+    def test_finds_tags(self):
+        source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
+        tags, text = find_tags(source)
+        assert tags == {"third", "fourth", "post", "othertag", "tag"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
+
+    def test_postfixed_tags(self):
+        source = "#foo) #bar] #hoo, #hee."
+        tags, text = find_tags(source)
+        assert tags == {"foo", "bar", "hoo", "hee"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."
+
+    def test_prefixed_tags(self):
+        source = "(#foo [#bar"
+        tags, text = find_tags(source)
+        assert tags == {"foo", "bar"}
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == "(#foo/foo [#bar/bar"
+
+    def test_invalid_text_returns_no_tags(self):
+        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
+                 "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
+        tags, text = find_tags(source)
+        assert tags == set()
+        assert text == source
+        tags, text = find_tags(source, replacer=self._replacer)
+        assert text == source


 class TestProcessTextLinks:
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@ -1,5 +1,5 @@
 import re
-from typing import Set
+from typing import Set, Tuple
 from urllib.parse import urlparse

 import bleach
@ -22,19 +22,29 @@ def encode_if_text(text):
        return text


-def find_tags(text: str) -> Set:
+def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
    """Find tags in text.

    Tries to ignore tags inside code blocks.
+
+    Optionally, if passed a "replacer", will also replace the tag word with the result
+    of the replacer function called with the tag word.
+
+    Returns a set of tags and the original or replaced text.
    """
    found_tags = set()
    lines = text.splitlines(keepends=True)
+    final_lines = []
    code_block = False
+    final_text = None
    # Check each line separately
    for line in lines:
+        final_words = []
        if line[0:3] == "```":
            code_block = not code_block
        if line.find("#") == -1 or line[0:4] == "    " or code_block:
+            # Just add the whole line
+            final_lines.append(line)
            continue
        # Check each word separately
        words = line.split(" ")
@ -44,7 +54,20 @@ def find_tags(text: str) -> Set:
                candidate = candidate.strip("#")
                if test_tag(candidate.lower()):
                    found_tags.add(candidate.lower())
-    return found_tags
+                    if replacer:
+                        try:
+                            tag_word = word.replace("#%s" % candidate, replacer(candidate))
+                            final_words.append(tag_word)
+                        except Exception:
+                            final_words.append(word)
+                else:
+                    final_words.append(word)
+            else:
+                final_words.append(word)
+        final_lines.append(" ".join(final_words))
+    if replacer:
+        final_text = "".join(final_lines)
+    return found_tags, final_text or text


 def get_path_from_url(url: str) -> str: