Add replacer functionality to the find_tags utility

merge-requests/159/head
Jason Robinson 2020-02-17 21:58:00 +02:00
rodzic b72ce5b870
commit 9cb2509ab6
4 zmienionych plików z 86 dodań i 28 usunięć

Wyświetl plik

@ -11,7 +11,8 @@
* Add `process_text_links` text utility to linkify URLs in text.
* Add `find_tags` text utility to find hashtags from text. This utility is used
* Add `find_tags` text utility to find hashtags in text. Optionally, the function can
also replace the tags through a given `replacer` function. This utility is used
to improve the tag extraction logic from entities text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70))
### Changed

Wyświetl plik

@ -230,7 +230,7 @@ class RawContentMixin(BaseEntity):
"""Returns a `list` of unique tags contained in `raw_content`."""
if not self.raw_content:
return []
tags = find_tags(self.raw_content)
tags, _text = find_tags(self.raw_content)
return sorted(tags)
def extract_mentions(self):

Wyświetl plik

@ -12,33 +12,67 @@ def test_encode_if_text():
class TestFindTags:
def test_factory_instance_has_tags(self):
assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"}
def test_extract_tags_adds_new_tags(self):
assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == {
"third", "fourth", "post", "othertag", "tag",
}
@staticmethod
def _replacer(text):
return f"#{text}/{text.lower()}"
def test_all_tags_are_parsed_from_text(self):
assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \
{"starting", "mixed", "line", "start"}
def test_invalid_text_returns_no_tags(self):
assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a "
"#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #aa #a”a #\xa0cd") == set()
def test_endings_are_filtered_out(self):
assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"}
def test_prefixed_tags(self):
assert find_tags("(#foo [#bar") == {"foo", "bar"}
def test_postfixed_tags(self):
assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"}
source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
tags, text = find_tags(source)
assert tags == {"starting", "mixed", "line", "start"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \
"#start/start on new line"
def test_code_block_tags_ignored(self):
assert find_tags("foo\n```\n#code\n```\n#notcode\n\n #alsocode\n") == {"notcode"}
source = "foo\n```\n#code\n```\n#notcode\n\n #alsocode\n"
tags, text = find_tags(source)
assert tags == {"notcode"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n"
def test_endings_are_filtered_out(self):
source = "#parenthesis) #exp! #list]"
tags, text = find_tags(source)
assert tags == {"parenthesis", "exp", "list"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
def test_finds_tags(self):
source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
tags, text = find_tags(source)
assert tags == {"third", "fourth", "post", "othertag", "tag"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
def test_postfixed_tags(self):
source = "#foo) #bar] #hoo, #hee."
tags, text = find_tags(source)
assert tags == {"foo", "bar", "hoo", "hee"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."
def test_prefixed_tags(self):
source = "(#foo [#bar"
tags, text = find_tags(source)
assert tags == {"foo", "bar"}
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == "(#foo/foo [#bar/bar"
def test_invalid_text_returns_no_tags(self):
source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
"#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #aa #a”a #\xa0cd"
tags, text = find_tags(source)
assert tags == set()
assert text == source
tags, text = find_tags(source, replacer=self._replacer)
assert text == source
class TestProcessTextLinks:

Wyświetl plik

@ -1,5 +1,5 @@
import re
from typing import Set
from typing import Set, Tuple
from urllib.parse import urlparse
import bleach
@ -22,19 +22,29 @@ def encode_if_text(text):
return text
def find_tags(text: str) -> Set:
def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
"""Find tags in text.
Tries to ignore tags inside code blocks.
Optionally, if passed a "replacer", will also replace the tag word with the result
of the replacer function called with the tag word.
Returns a set of tags and the original or replaced text.
"""
found_tags = set()
lines = text.splitlines(keepends=True)
final_lines = []
code_block = False
final_text = None
# Check each line separately
for line in lines:
final_words = []
if line[0:3] == "```":
code_block = not code_block
if line.find("#") == -1 or line[0:4] == " " or code_block:
# Just add the whole line
final_lines.append(line)
continue
# Check each word separately
words = line.split(" ")
@ -44,7 +54,20 @@ def find_tags(text: str) -> Set:
candidate = candidate.strip("#")
if test_tag(candidate.lower()):
found_tags.add(candidate.lower())
return found_tags
if replacer:
try:
tag_word = word.replace("#%s" % candidate, replacer(candidate))
final_words.append(tag_word)
except Exception:
final_words.append(word)
else:
final_words.append(word)
else:
final_words.append(word)
final_lines.append(" ".join(final_words))
if replacer:
final_text = "".join(final_lines)
return found_tags, final_text or text
def get_path_from_url(url: str) -> str: