From 9cb2509ab6cb75c4a8596a8bd7f5e70d48806aff Mon Sep 17 00:00:00 2001 From: Jason Robinson Date: Mon, 17 Feb 2020 21:58:00 +0200 Subject: [PATCH] Add replacer functionality to the find_tags utility --- CHANGELOG.md | 3 +- federation/entities/mixins.py | 2 +- federation/tests/utils/test_text.py | 80 ++++++++++++++++++++--------- federation/utils/text.py | 29 +++++++++-- 4 files changed, 86 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ee6bf8..e311c0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,8 @@ * Add `process_text_links` text utility to linkify URL's in text. -* Add `find_tags` text utility to find hashtags from text. This utility is used +* Add `find_tags` text utility to find hashtags from text. Optionally the function can + also replace the tags through a given `replacer` function. This utility is used to improve the tag extraction logic from entities text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70)) ### Changed diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index c7a6a1c..4a00ba4 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -230,7 +230,7 @@ class RawContentMixin(BaseEntity): """Returns a `list` of unique tags contained in `raw_content`.""" if not self.raw_content: return [] - tags = find_tags(self.raw_content) + tags, _text = find_tags(self.raw_content) return sorted(tags) def extract_mentions(self): diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 61c10fc..6f0c1b1 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -12,33 +12,67 @@ def test_encode_if_text(): class TestFindTags: - def test_factory_instance_has_tags(self): - assert find_tags("**Foobar** #tag #othertag") == {"tag", "othertag"} - - def test_extract_tags_adds_new_tags(self): - assert find_tags("#post **Foobar** #tag #OtherTag #third\n#fourth") == { - "third", "fourth", "post", "othertag", "tag", - } + @staticmethod + def _replacer(text): + return f"#{text}/{text.lower()}" def test_all_tags_are_parsed_from_text(self): - assert find_tags("#starting and #MixED with some #line\nendings also tags can\n#start on new line") == \ - {"starting", "mixed", "line", "start"} - - def test_invalid_text_returns_no_tags(self): - assert find_tags("#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a #a?a #a`a " - "#a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd") == set() - - def test_endings_are_filtered_out(self): - assert find_tags("#parenthesis) #exp! #list]") == {"parenthesis", "exp", "list"} - - def test_prefixed_tags(self): - assert find_tags("(#foo [#bar") == {"foo", "bar"} - - def test_postfixed_tags(self): - assert find_tags("#foo) #bar] #hoo, #hee.") == {"foo", "bar", "hoo", "hee"} + source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line" + tags, text = find_tags(source) + assert tags == {"starting", "mixed", "line", "start"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \ + "#start/start on new line" def test_code_block_tags_ignored(self): - assert find_tags("foo\n```\n#code\n```\n#notcode\n\n #alsocode\n") == {"notcode"} + source = "foo\n```\n#code\n```\n#notcode\n\n #alsocode\n" + tags, text = find_tags(source) + assert tags == {"notcode"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n" + + def test_endings_are_filtered_out(self): + source = "#parenthesis) #exp! #list]" + tags, text = find_tags(source) + assert tags == {"parenthesis", "exp", "list"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]" + + def test_finds_tags(self): + source = "#post **Foobar** #tag #OtherTag #third\n#fourth" + tags, text = find_tags(source) + assert tags == {"third", "fourth", "post", "othertag", "tag"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth" + + def test_postfixed_tags(self): + source = "#foo) #bar] #hoo, #hee." + tags, text = find_tags(source) + assert tags == {"foo", "bar", "hoo", "hee"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee." + + def test_prefixed_tags(self): + source = "(#foo [#bar" + tags, text = find_tags(source) + assert tags == {"foo", "bar"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "(#foo/foo [#bar/bar" + + def test_invalid_text_returns_no_tags(self): + source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \ + "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd" + tags, text = find_tags(source) + assert tags == set() + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == source class TestProcessTextLinks: diff --git a/federation/utils/text.py b/federation/utils/text.py index aff0699..8bfb02b 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -1,5 +1,5 @@ import re -from typing import Set +from typing import Set, Tuple from urllib.parse import urlparse import bleach @@ -22,19 +22,29 @@ def encode_if_text(text): return text -def find_tags(text: str) -> Set: +def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: """Find tags in text. Tries to ignore tags inside code blocks. + + Optionally, if passed a "replacer", will also replace the tag word with the result + of the replacer function called with the tag word. + + Returns a set of tags and the original or replaced text. """ found_tags = set() lines = text.splitlines(keepends=True) + final_lines = [] code_block = False + final_text = None # Check each line separately for line in lines: + final_words = [] if line[0:3] == "```": code_block = not code_block if line.find("#") == -1 or line[0:4] == " " or code_block: + # Just add the whole line + final_lines.append(line) continue # Check each word separately words = line.split(" ") @@ -44,7 +54,20 @@ def find_tags(text: str) -> Set: candidate = candidate.strip("#") if test_tag(candidate.lower()): found_tags.add(candidate.lower()) - return found_tags + if replacer: + try: + tag_word = word.replace("#%s" % candidate, replacer(candidate)) + final_words.append(tag_word) + except Exception: + final_words.append(word) + else: + final_words.append(word) + else: + final_words.append(word) + final_lines.append(" ".join(final_words)) + if replacer: + final_text = "".join(final_lines) + return found_tags, final_text or text def get_path_from_url(url: str) -> str: