Mirror of https://gitlab.com/jaywink/federation
Add `find_tags` text utility to find hashtags from text
Refs: https://git.feneas.org/socialhome/socialhome/issues/572 (merge-requests/159/head)
parent
3678c520dd
commit
712c6d2c46
|
@ -11,6 +11,8 @@
|
||||||
|
|
||||||
* Add `process_text_links` text utility to linkify URL's in text.
|
* Add `process_text_links` text utility to linkify URL's in text.
|
||||||
|
|
||||||
|
* Add `find_tags` text utility to find hashtags from text.
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.
|
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
|
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
|
||||||
|
|
||||||
|
|
||||||
def test_decode_if_bytes():
|
def test_decode_if_bytes():
|
||||||
|
@ -11,6 +11,36 @@ def test_encode_if_text():
|
||||||
assert encode_if_text("foobar") == b"foobar"
|
assert encode_if_text("foobar") == b"foobar"
|
||||||
|
|
||||||
|
|
||||||
|
class TestFindTags:
    """Tests for the `find_tags` text utility (hashtag extraction)."""

    def test_factory_instance_has_tags(self):
        tags = find_tags("**Foobar** #tag #othertag")
        assert tags == {"tag", "othertag"}

    def test_extract_tags_adds_new_tags(self):
        tags = find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth")
        assert tags == {
            "third", "fourth", "post", "othertag", "tag",
        }

    def test_all_tags_are_parsed_from_text(self):
        # Tags are lowercased and found across line endings.
        tags = find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line")
        assert tags == {"starting", "mixed", "line", "start"}

    def test_invalid_text_returns_no_tags(self):
        # Every illegal character in a candidate rejects the whole tag.
        tags = find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
                         "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd")
        assert tags == set()

    def test_endings_are_filtered_out(self):
        tags = find_tags("#parenthesis) #exp! #list]")
        assert tags == {"parenthesis", "exp", "list"}

    def test_prefixed_tags(self):
        tags = find_tags("(#foo [#bar")
        assert tags == {"foo", "bar"}

    def test_postfixed_tags(self):
        tags = find_tags("#foo) #bar] #hoo, #hee.")
        assert tags == {"foo", "bar", "hoo", "hee"}

    def test_code_block_tags_ignored(self):
        # Fenced (```) and four-space-indented blocks do not contribute tags.
        tags = find_tags("foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n")
        assert tags == {"notcode"}
|
||||||
|
|
||||||
|
|
||||||
class TestProcessTextLinks:
|
class TestProcessTextLinks:
|
||||||
def test_link_at_start_or_end(self):
|
def test_link_at_start_or_end(self):
|
||||||
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
|
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
import re
|
import re
|
||||||
|
from typing import Set
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import bleach
|
import bleach
|
||||||
from bleach import callbacks
|
from bleach import callbacks
|
||||||
|
|
||||||
|
# Characters that may not appear anywhere inside a hashtag candidate;
# consumed by `test_tag` to reject invalid tags.
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
|
||||||
|
|
||||||
|
|
||||||
def decode_if_bytes(text):
|
def decode_if_bytes(text):
|
||||||
try:
|
try:
|
||||||
|
@ -19,6 +22,31 @@ def encode_if_text(text):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def find_tags(text: str) -> Set:
    """Find hashtags in ``text`` and return them as a lowercased set.

    Tries to ignore tags inside code blocks: anything between ```-fenced
    lines and lines indented by four spaces is skipped.
    """
    tags = set()
    inside_fence = False
    # Scan line by line so code-block state can be tracked.
    for line in text.splitlines(keepends=True):
        # A ``` fence line toggles the code-block state.
        if line.startswith("```"):
            inside_fence = not inside_fence
        # Nothing to do for lines without a '#', indented code lines,
        # or lines inside a fenced block.
        if "#" not in line or line.startswith("    ") or inside_fence:
            continue
        # Inspect each space-separated token on the line.
        for token in line.split(" "):
            cleaned = token.strip().strip("([]),.!?:")
            if not cleaned.startswith("#"):
                continue
            candidate = cleaned.strip("#").lower()
            if test_tag(candidate):
                tags.add(candidate)
    return tags
|
||||||
|
|
||||||
|
|
||||||
def get_path_from_url(url: str) -> str:
|
def get_path_from_url(url: str) -> str:
|
||||||
"""
|
"""
|
||||||
Return only the path part of an URL.
|
Return only the path part of an URL.
|
||||||
|
@ -50,6 +78,16 @@ def process_text_links(text):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_tag(tag: str) -> bool:
    """Test a word whether it could be accepted as a tag.

    An empty string is never a valid tag, and neither is any word that
    contains a character from ``ILLEGAL_TAG_CHARS``.
    """
    if not tag:
        return False
    return not any(char in tag for char in ILLEGAL_TAG_CHARS)
|
||||||
|
|
||||||
|
|
||||||
def validate_handle(handle):
|
def validate_handle(handle):
|
||||||
"""
|
"""
|
||||||
Very basic handle validation as per
|
Very basic handle validation as per
|
||||||
|
|
Loading…
Reference in New Issue