Add `find_tags` text utility to find hashtags from text

Refs: https://git.feneas.org/socialhome/socialhome/issues/572
merge-requests/159/head
Jason Robinson 2020-02-17 21:01:47 +02:00
rodzic 3678c520dd
commit 712c6d2c46
3 zmienionych plików z 71 dodań i 1 usunięć

Wyświetl plik

@ -11,6 +11,8 @@
* Add `process_text_links` text utility to linkify URLs in text.
* Add `find_tags` text utility to find hashtags in text.
### Changed
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.

Wyświetl plik

@ -1,4 +1,4 @@
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
def test_decode_if_bytes():
@ -11,6 +11,36 @@ def test_encode_if_text():
assert encode_if_text("foobar") == b"foobar"
class TestFindTags:
    """Tests for ``find_tags``, the hashtag extractor from ``federation.utils.text``."""

    def test_factory_instance_has_tags(self):
        # Markdown emphasis on neighbouring words must not hide the tags.
        assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"}

    def test_extract_tags_adds_new_tags(self):
        # Tags are lowercased and found across line breaks.
        assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == {
            "third", "fourth", "post", "othertag", "tag",
        }

    def test_all_tags_are_parsed_from_text(self):
        # Tags may appear at the start, middle or end of a line.
        assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \
            {"starting", "mixed", "line", "start"}

    def test_invalid_text_returns_no_tags(self):
        # Every candidate embeds exactly one character from ILLEGAL_TAG_CHARS,
        # so none of them may be accepted as a tag.
        assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
                         "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd") == set()

    def test_endings_are_filtered_out(self):
        # Trailing punctuation is trimmed before the tag is validated.
        assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"}

    def test_prefixed_tags(self):
        # Leading brackets are trimmed as well.
        assert find_tags("(#foo [#bar") == {"foo", "bar"}

    def test_postfixed_tags(self):
        assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"}

    def test_code_block_tags_ignored(self):
        # "#code" sits inside a ``` fence and "#alsocode" inside a four-space
        # indented code block; only "#notcode" is regular text.
        assert find_tags("foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n") == {"notcode"}
class TestProcessTextLinks:
def test_link_at_start_or_end(self):
assert process_text_links('https://example.org example.org\nhttp://example.org') == \

Wyświetl plik

@ -1,9 +1,12 @@
import re
from typing import Set
from urllib.parse import urlparse
import bleach
from bleach import callbacks
# Characters that may not appear anywhere inside a tag; `test_tag` rejects any
# candidate containing one of them.
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
def decode_if_bytes(text):
try:
@ -19,6 +22,31 @@ def encode_if_text(text):
return text
def find_tags(text: str) -> Set[str]:
    """Find hashtags in text.

    The text is scanned line by line and each line is split on single spaces.
    A word counts as a tag when, after trimming surrounding whitespace and the
    punctuation ``([]),.!?:`` from both ends, it starts with ``#`` and the
    remainder passes ``test_tag``.

    Tries to ignore tags inside code blocks:

    * lines between a pair of lines starting with three backticks (a fenced
      code block), including the opening fence line itself;
    * lines starting with four spaces (an indented code block).

    :param text: Text to search for hashtags.
    :returns: Set of the found tags, lowercased and without the leading ``#``.
    """
    found_tags = set()
    code_block = False
    # Check each line separately
    for line in text.splitlines(keepends=True):
        if line.startswith("```"):
            # A fence line opens or closes a fenced code block.
            code_block = not code_block
        # Compare against exactly four spaces: a four character slice can never
        # equal a single space unless the whole line is one space.
        if code_block or "#" not in line or line.startswith("    "):
            continue
        # Check each word separately
        for word in line.split(" "):
            candidate = word.strip().strip("([]),.!?:")
            if candidate.startswith("#"):
                candidate = candidate.strip("#").lower()
                if test_tag(candidate):
                    found_tags.add(candidate)
    return found_tags
def get_path_from_url(url: str) -> str:
"""
Return only the path part of an URL.
@ -50,6 +78,16 @@ def process_text_links(text):
)
def test_tag(tag: str) -> bool:
    """Tell whether a word is acceptable as a tag.

    An empty word is never a tag, and neither is a word that contains any
    character listed in ``ILLEGAL_TAG_CHARS``.

    :param tag: Candidate word, without the leading ``#``.
    :returns: ``True`` if the word can be used as a tag, ``False`` otherwise.
    """
    if not tag:
        return False
    return not any(illegal in tag for illegal in ILLEGAL_TAG_CHARS)
def validate_handle(handle):
"""
Very basic handle validation as per