kopia lustrzana https://gitlab.com/jaywink/federation
Add `find_tags` text utility to find hashtags from text
Refs: https://git.feneas.org/socialhome/socialhome/issues/572
merge-requests/159/head
rodzic
3678c520dd
commit
712c6d2c46
|
@ -11,6 +11,8 @@
|
|||
|
||||
* Add `process_text_links` text utility to linkify URLs in text.
|
||||
|
||||
* Add `find_tags` text utility to find hashtags from text.
|
||||
|
||||
### Changed
|
||||
|
||||
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
|
||||
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
|
||||
|
||||
|
||||
def test_decode_if_bytes():
|
||||
|
@ -11,6 +11,36 @@ def test_encode_if_text():
|
|||
assert encode_if_text("foobar") == b"foobar"
|
||||
|
||||
|
||||
class TestFindTags:
    """Tests for the ``find_tags`` text utility."""

    def test_factory_instance_has_tags(self):
        # Markdown emphasis around other words must not hide the tags.
        assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"}

    def test_extract_tags_adds_new_tags(self):
        # Tags are lowercased and found at line start as well as mid-line.
        assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == {
            "third", "fourth", "post", "othertag", "tag",
        }

    def test_all_tags_are_parsed_from_text(self):
        assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \
            {"starting", "mixed", "line", "start"}

    def test_invalid_text_returns_no_tags(self):
        # Every candidate contains a character that is not allowed in a tag.
        assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
                         "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd") == set()

    def test_endings_are_filtered_out(self):
        assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"}

    def test_prefixed_tags(self):
        assert find_tags("(#foo [#bar") == {"foo", "bar"}

    def test_postfixed_tags(self):
        assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"}

    def test_code_block_tags_ignored(self):
        # "#code" sits inside a backtick fence and "#alsocode" on a line
        # indented by four spaces (an indented code line); only the plain
        # "#notcode" line may yield a tag.
        assert find_tags("foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n") == {"notcode"}
|
||||
|
||||
|
||||
class TestProcessTextLinks:
|
||||
def test_link_at_start_or_end(self):
|
||||
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
import re
|
||||
from typing import Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import bleach
|
||||
from bleach import callbacks
|
||||
|
||||
# Characters that may not appear anywhere in a tag: test_tag() rejects any
# candidate word containing at least one of them.
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
|
||||
|
||||
|
||||
def decode_if_bytes(text):
|
||||
try:
|
||||
|
@ -19,6 +22,31 @@ def encode_if_text(text):
|
|||
return text
|
||||
|
||||
|
||||
def find_tags(text: str) -> Set:
    """Find hashtags in text.

    The text is scanned line by line and every line is split on single
    spaces. A word counts as a tag when, after surrounding whitespace and the
    punctuation ``([]),.!?:`` are stripped, it starts with ``#`` and the
    remainder passes ``test_tag``. Tags are lowercased before being collected.

    Tries to ignore tags inside code blocks: lines between a pair of lines
    that start with three backticks, and lines indented by four spaces.

    :param text: Text to search.
    :returns: Set of lowercased tag names without the leading ``#``.
    """
    found_tags = set()
    code_block = False
    # Check each line separately
    for line in text.splitlines(keepends=True):
        if line.startswith("```"):
            # A fence line opens or closes a code block.
            code_block = not code_block
        # Skip lines that cannot contribute a tag: no hash at all, an
        # indented code line (four leading spaces), or anything inside a fence.
        if "#" not in line or line.startswith("    ") or code_block:
            continue
        # Check each word separately
        for word in line.split(" "):
            candidate = word.strip().strip("([]),.!?:")
            if not candidate.startswith("#"):
                continue
            # Drop the hash marker(s) and normalise case before validating.
            tag = candidate.strip("#").lower()
            if test_tag(tag):
                found_tags.add(tag)
    return found_tags
|
||||
|
||||
|
||||
def get_path_from_url(url: str) -> str:
|
||||
"""
|
||||
Return only the path part of an URL.
|
||||
|
@ -50,6 +78,16 @@ def process_text_links(text):
|
|||
)
|
||||
|
||||
|
||||
def test_tag(tag: str) -> bool:
    """Check whether a word is acceptable as a tag.

    An empty word is rejected, and so is any word containing a character
    listed in ``ILLEGAL_TAG_CHARS``.
    """
    if not tag:
        return False
    return not any(illegal in tag for illegal in ILLEGAL_TAG_CHARS)
|
||||
|
||||
|
||||
def validate_handle(handle):
|
||||
"""
|
||||
Very basic handle validation as per
|
||||
|
|
Ładowanie…
Reference in New Issue