Mirror of https://gitlab.com/jaywink/federation
Add `find_tags` text utility to find hashtags from text
Refs: https://git.feneas.org/socialhome/socialhome/issues/572 (merge-requests/159/head)
parent
3678c520dd
commit
712c6d2c46
|
@ -11,6 +11,8 @@
|
||||||
|
|
||||||
* Add `process_text_links` text utility to linkify URL's in text.
|
* Add `process_text_links` text utility to linkify URL's in text.
|
||||||
|
|
||||||
|
* Add `find_tags` text utility to find hashtags from text.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.
|
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
|
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
|
||||||
|
|
||||||
|
|
||||||
def test_decode_if_bytes():
|
def test_decode_if_bytes():
|
||||||
|
@ -11,6 +11,36 @@ def test_encode_if_text():
|
||||||
assert encode_if_text("foobar") == b"foobar"
|
assert encode_if_text("foobar") == b"foobar"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindTags:
    """Tests for the `find_tags` text utility (hashtag extraction)."""

    def test_factory_instance_has_tags(self):
        tags = find_tags("**Foobar** #tag #othertag")
        assert tags == {"tag", "othertag"}

    def test_extract_tags_adds_new_tags(self):
        tags = find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth")
        assert tags == {
            "third", "fourth", "post", "othertag", "tag",
        }

    def test_all_tags_are_parsed_from_text(self):
        # Tags are lowercased and found across line endings.
        tags = find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line")
        assert tags == {"starting", "mixed", "line", "start"}

    def test_invalid_text_returns_no_tags(self):
        # Every illegal character in a candidate rejects the whole tag.
        tags = find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
                         "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd")
        assert tags == set()

    def test_endings_are_filtered_out(self):
        tags = find_tags("#parenthesis) #exp! #list]")
        assert tags == {"parenthesis", "exp", "list"}

    def test_prefixed_tags(self):
        tags = find_tags("(#foo [#bar")
        assert tags == {"foo", "bar"}

    def test_postfixed_tags(self):
        tags = find_tags("#foo) #bar] #hoo, #hee.")
        assert tags == {"foo", "bar", "hoo", "hee"}

    def test_code_block_tags_ignored(self):
        # Fenced (```) and four-space-indented blocks do not contribute tags.
        tags = find_tags("foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n")
        assert tags == {"notcode"}
|
||||||
|
|
||||||
|
|
||||||
class TestProcessTextLinks:
|
class TestProcessTextLinks:
|
||||||
def test_link_at_start_or_end(self):
|
def test_link_at_start_or_end(self):
|
||||||
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
|
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
import re
|
import re
|
||||||
|
from typing import Set
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import bleach
|
import bleach
|
||||||
from bleach import callbacks
|
from bleach import callbacks
|
||||||
|
|
||||||
|
# Characters that may not appear anywhere inside a hashtag candidate;
# consumed by `test_tag` to reject invalid tags.
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
|
||||||
|
|
||||||
|
|
||||||
def decode_if_bytes(text):
|
def decode_if_bytes(text):
|
||||||
try:
|
try:
|
||||||
|
@ -19,6 +22,31 @@ def encode_if_text(text):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def find_tags(text: str) -> Set:
    """Find hashtags in ``text`` and return them as a lowercased set.

    Tries to ignore tags inside code blocks: anything between ```-fenced
    lines and lines indented by four spaces is skipped.
    """
    tags = set()
    inside_fence = False
    # Scan line by line so code-block state can be tracked.
    for line in text.splitlines(keepends=True):
        # A ``` fence line toggles the code-block state.
        if line.startswith("```"):
            inside_fence = not inside_fence
        # Nothing to do for lines without a '#', indented code lines,
        # or lines inside a fenced block.
        if "#" not in line or line.startswith("    ") or inside_fence:
            continue
        # Inspect each space-separated token on the line.
        for token in line.split(" "):
            cleaned = token.strip().strip("([]),.!?:")
            if not cleaned.startswith("#"):
                continue
            candidate = cleaned.strip("#").lower()
            if test_tag(candidate):
                tags.add(candidate)
    return tags
|
||||||
|
|
||||||
|
|
||||||
def get_path_from_url(url: str) -> str:
|
def get_path_from_url(url: str) -> str:
|
||||||
"""
|
"""
|
||||||
Return only the path part of an URL.
|
Return only the path part of an URL.
|
||||||
|
@ -50,6 +78,16 @@ def process_text_links(text):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tag(tag: str) -> bool:
    """Test a word whether it could be accepted as a tag.

    An empty string is never a valid tag, and neither is any word that
    contains a character from ``ILLEGAL_TAG_CHARS``.
    """
    if not tag:
        return False
    return not any(char in tag for char in ILLEGAL_TAG_CHARS)
|
||||||
|
|
||||||
|
|
||||||
def validate_handle(handle):
|
def validate_handle(handle):
|
||||||
"""
|
"""
|
||||||
Very basic handle validation as per
|
Very basic handle validation as per
|
||||||
|
|
Loading…
Reference in New Issue