From 712c6d2c46389247be0e97e15042a7cd7b1071a1 Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 17 Feb 2020 21:01:47 +0200 Subject: [PATCH] Add `find_tags` text utility to find hashtags from text Refs: https://git.feneas.org/socialhome/socialhome/issues/572 --- CHANGELOG.md | 2 ++ federation/tests/utils/test_text.py | 32 +++++++++++++++++++++++- federation/utils/text.py | 38 +++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90601be..14324d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ * Add `process_text_links` text utility to linkify URL's in text. +* Add `find_tags` text utility to find hashtags from text. + ### Changed * The NodeInfo2 hostmeta parser now cleans the port out of the host name. diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 32aecf4..61c10fc 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -1,4 +1,4 @@ -from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links +from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags def test_decode_if_bytes(): @@ -11,6 +11,36 @@ def test_encode_if_text(): assert encode_if_text("foobar") == b"foobar" +class TestFindTags: + def test_factory_instance_has_tags(self): + assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"} + + def test_extract_tags_adds_new_tags(self): + assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == { + "third", "fourth", "post", "othertag", "tag", + } + + def test_all_tags_are_parsed_from_text(self): + assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \ + {"starting", "mixed", "line", "start"} + + def test_invalid_text_returns_no_tags(self): + assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a " + "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd") == set() + + def test_endings_are_filtered_out(self): + assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"} + + def test_prefixed_tags(self): + assert find_tags("(#foo [#bar") == {"foo", "bar"} + + def test_postfixed_tags(self): + assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"} + + def test_code_block_tags_ignored(self): + assert find_tags("foo\n```\n#code\n```\n#notcode\n\n #alsocode\n") == {"notcode"} + + class TestProcessTextLinks: def test_link_at_start_or_end(self): assert process_text_links('https://example.org example.org\nhttp://example.org') == \ diff --git a/federation/utils/text.py b/federation/utils/text.py index ad67973..aff0699 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -1,9 +1,12 @@ import re +from typing import Set from urllib.parse import urlparse import bleach from bleach import callbacks +ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" + def decode_if_bytes(text): try: @@ -19,6 +22,31 @@ def encode_if_text(text): return text +def find_tags(text: str) -> Set: + """Find tags in text. + + Tries to ignore tags inside code blocks. + """ + found_tags = set() + lines = text.splitlines(keepends=True) + code_block = False + # Check each line separately + for line in lines: + if line[0:3] == "```": + code_block = not code_block + if line.find("#") == -1 or line[0:4] == " " or code_block: + continue + # Check each word separately + words = line.split(" ") + for word in words: + candidate = word.strip().strip("([]),.!?:") + if candidate.startswith("#"): + candidate = candidate.strip("#") + if test_tag(candidate.lower()): + found_tags.add(candidate.lower()) + return found_tags + + def get_path_from_url(url: str) -> str: """ Return only the path part of an URL. @@ -50,6 +78,16 @@ def process_text_links(text): ) +def test_tag(tag: str) -> bool: + """Test a word whether it could be accepted as a tag.""" + if not tag: + return False + for char in ILLEGAL_TAG_CHARS: + if char in tag: + return False + return True + + def validate_handle(handle): """ Very basic handle validation as per