Add `find_tags` text utility to find hashtags from text

Refs: https://git.feneas.org/socialhome/socialhome/issues/572
2020-02-17 21:01:47 +02:00 · 2020-02-17 21:01:47 +02:00 · 712c6d2c46
commit 712c6d2c46
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -11,6 +11,8 @@
  
 * Add `process_text_links` text utility to linkify URL's in text.

+* Add `find_tags` text utility to find hashtags from text.
+
 ### Changed

 * The NodeInfo2 hostmeta parser now cleans the port out of the host name.
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@ -1,4 +1,4 @@
-from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
+from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags


 def test_decode_if_bytes():
@ -11,6 +11,36 @@ def test_encode_if_text():
    assert encode_if_text("foobar") == b"foobar"


+class TestFindTags:
+    """Tests for the ``find_tags`` text utility."""
+
+    def test_factory_instance_has_tags(self):
+        """Tags following other words on the same line are found."""
+        tags = find_tags("**Foobar** #tag #othertag")
+        assert tags == {"tag", "othertag"}
+
+    def test_extract_tags_adds_new_tags(self):
+        """Tags are collected across lines and returned lowercased."""
+        tags = find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth")
+        expected = {"third", "fourth", "post", "othertag", "tag"}
+        assert tags == expected
+
+    def test_all_tags_are_parsed_from_text(self):
+        """Tags at line start, mid-line and line end are all found."""
+        text = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
+        expected = {"starting", "mixed", "line", "start"}
+        assert find_tags(text) == expected
+
+    def test_invalid_text_returns_no_tags(self):
+        """Words with an illegal character inside them are not tags."""
+        text = (
+            "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
+            "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
+        )
+        assert find_tags(text) == set()
+
+    def test_endings_are_filtered_out(self):
+        """Trailing punctuation is not part of the tag."""
+        tags = find_tags("#parenthesis) #exp! #list]")
+        assert tags == {"parenthesis", "exp", "list"}
+
+    def test_prefixed_tags(self):
+        """Leading brackets are not part of the tag."""
+        tags = find_tags("(#foo [#bar")
+        assert tags == {"foo", "bar"}
+
+    def test_postfixed_tags(self):
+        """Trailing brackets, commas and periods are not part of the tag."""
+        tags = find_tags("#foo) #bar] #hoo, #hee.")
+        assert tags == {"foo", "bar", "hoo", "hee"}
+
+    def test_code_block_tags_ignored(self):
+        """Tags in fenced or four-space indented code are skipped."""
+        text = "foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n"
+        assert find_tags(text) == {"notcode"}
+
+
 class TestProcessTextLinks:
    def test_link_at_start_or_end(self):
        assert process_text_links('https://example.org example.org\nhttp://example.org') == \
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@ -1,9 +1,12 @@
 import re
+from typing import Set
 from urllib.parse import urlparse

 import bleach
 from bleach import callbacks

+# Characters that make a word invalid as a tag when found anywhere inside it (checked in `test_tag`).
+ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
+

 def decode_if_bytes(text):
    try:
@ -19,6 +22,31 @@ def encode_if_text(text):
        return text


+def find_tags(text: str) -> Set[str]:
+    """Find hashtags in text.
+
+    The text is checked line by line and each line word by word. A word is a tag
+    candidate if it starts with ``#`` once surrounding whitespace and the characters
+    ``([]),.!?:`` have been stripped from its ends. Candidates are lowercased and
+    accepted only if ``test_tag`` approves them.
+
+    Tries to ignore tags inside code blocks: a line starting with three backticks
+    toggles the code block state, and lines starting with four spaces are skipped.
+
+    :param text: Text to search for tags.
+    :returns: Set of lowercased tags without the leading ``#``.
+    """
+    found_tags = set()
+    code_block = False
+    # Check each line separately
+    for line in text.splitlines(keepends=True):
+        if line[0:3] == "```":
+            code_block = not code_block
+        if code_block or "#" not in line or line[0:4] == "    ":
+            continue
+        # Check each word separately. Tabs separate words just like spaces do; splitting on
+        # spaces only would let a tab end up inside a tag or hide a tag that follows a tab.
+        for word in line.replace("\t", " ").split(" "):
+            candidate = word.strip().strip("([]),.!?:")
+            if not candidate.startswith("#"):
+                continue
+            tag = candidate.strip("#").lower()
+            if test_tag(tag):
+                found_tags.add(tag)
+    return found_tags
+
+
 def get_path_from_url(url: str) -> str:
    """
    Return only the path part of an URL.
@ -50,6 +78,16 @@ def process_text_links(text):
    )


+def test_tag(tag: str) -> bool:
+    """Check whether a word could be accepted as a tag.
+
+    A tag is rejected when it is empty or contains any character listed in
+    ``ILLEGAL_TAG_CHARS``.
+    """
+    return bool(tag) and not any(illegal in tag for illegal in ILLEGAL_TAG_CHARS)
+
+
 def validate_handle(handle):
    """
    Very basic handle validation as per