diff --git a/CHANGELOG.md b/CHANGELOG.md index 90601be..e311c0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ * Add `process_text_links` text utility to linkify URL's in text. +* Add `find_tags` text utility to find hashtags from text. Optionally the function can + also replace the tags through a given `replacer` function. This utility is used + to improve the tag extraction logic from entities text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70)) + ### Changed * The NodeInfo2 hostmeta parser now cleans the port out of the host name. diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 13f8ac4..3542a2d 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -8,7 +8,7 @@ from commonmark import commonmark from federation.entities.activitypub.enums import ActivityType from federation.entities.utils import get_name_for_profile -from federation.utils.text import process_text_links +from federation.utils.text import process_text_links, find_tags class BaseEntity: @@ -204,10 +204,27 @@ class RawContentMixin(BaseEntity): @property def rendered_content(self) -> str: """Returns the rendered version of raw_content, or just raw_content.""" + from federation.utils.django import get_configuration + try: + config = get_configuration() + if config["tags_path"]: + def linkifier(tag: str) -> str: + return f'' \ + f'#{tag}' + else: + linkifier = None + except ImportError: + linkifier = None + if self._rendered_content: return self._rendered_content elif self._media_type == "text/markdown" and self.raw_content: - rendered = commonmark(self.raw_content).strip() + # Do tags + _tags, rendered = find_tags(self.raw_content, replacer=linkifier) + # Render markdown to HTML + rendered = commonmark(rendered).strip() + # Do mentions if self._mentions: for mention in self._mentions: # Only linkify mentions that are URL's @@ -218,7 +235,7 @@ class RawContentMixin(BaseEntity): display_name = mention rendered = rendered.replace( "@{%s}" % mention, - f'@{display_name}', + f'@{display_name}', ) # Finally linkify remaining URL's that are not links rendered = process_text_links(rendered) @@ -230,7 +247,7 @@ class RawContentMixin(BaseEntity): """Returns a `list` of unique tags contained in `raw_content`.""" if not self.raw_content: return [] - tags = {word.strip("#").lower() for word in self.raw_content.split() if word.startswith("#") and len(word) > 1} + tags, _text = find_tags(self.raw_content) return sorted(tags) def extract_mentions(self): diff --git a/federation/entities/utils.py b/federation/entities/utils.py index 6126051..40a5c52 100644 --- a/federation/entities/utils.py +++ b/federation/entities/utils.py @@ -34,5 +34,5 @@ def get_name_for_profile(fid: str) -> Optional[str]: return profile.username else: return profile.name - except ImportError: + except Exception: pass diff --git a/federation/outbound.py b/federation/outbound.py index 6219423..9956929 100644 --- a/federation/outbound.py +++ b/federation/outbound.py @@ -2,6 +2,7 @@ import copy import importlib import json import logging +import traceback from typing import List, Dict, Union # noinspection PyPackageRequirements @@ -170,8 +171,10 @@ def handle_send( if isinstance(payload.get("object"), dict): payload["object"]["to"] = [fid] rendered_payload = json.dumps(payload).encode("utf-8") - except Exception as ex: - logger.error("handle_send - failed to generate payload for %s, %s: %s", fid, endpoint, ex) + except Exception: + logger.error( + "handle_send - failed to generate payload for %s, %s: %s", fid, endpoint, traceback.format_exc(), + ) continue payloads.append({ "auth": get_http_authentication(author_user.rsa_private_key, f"{author_user.id}#main-key"), diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py index a8dcf76..87ec829 100644 --- a/federation/tests/entities/activitypub/test_entities.py +++ b/federation/tests/entities/activitypub/test_entities.py @@ -181,7 +181,7 @@ class TestEntitiesConvertToAS2: 'attributedTo': 'http://127.0.0.1:8000/profile/123456/', 'content': '

raw_content

\n

@{someone@localhost.local} @' - 'Bob Bobértson

', + 'Bob Bobértson

', 'published': '2019-04-27T00:00:00', 'inReplyTo': None, 'sensitive': False, @@ -234,7 +234,15 @@ class TestEntitiesConvertToAS2: 'id': 'http://127.0.0.1:8000/post/123456/', 'type': 'Note', 'attributedTo': 'http://127.0.0.1:8000/profile/123456/', - 'content': '

raw_content

\n

#foobar\n#barfoo

', + 'content': '

raw_content

\n' + '

#foobar\n' + '#barfoo

', 'published': '2019-04-27T00:00:00', 'inReplyTo': None, 'sensitive': False, diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 32aecf4..6f0c1b1 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -1,4 +1,4 @@ -from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links +from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags def test_decode_if_bytes(): @@ -11,6 +11,70 @@ def test_encode_if_text(): assert encode_if_text("foobar") == b"foobar" +class TestFindTags: + @staticmethod + def _replacer(text): + return f"#{text}/{text.lower()}" + + def test_all_tags_are_parsed_from_text(self): + source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line" + tags, text = find_tags(source) + assert tags == {"starting", "mixed", "line", "start"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \ + "#start/start on new line" + + def test_code_block_tags_ignored(self): + source = "foo\n```\n#code\n```\n#notcode\n\n #alsocode\n" + tags, text = find_tags(source) + assert tags == {"notcode"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n" + + def test_endings_are_filtered_out(self): + source = "#parenthesis) #exp! #list]" + tags, text = find_tags(source) + assert tags == {"parenthesis", "exp", "list"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]" + + def test_finds_tags(self): + source = "#post **Foobar** #tag #OtherTag #third\n#fourth" + tags, text = find_tags(source) + assert tags == {"third", "fourth", "post", "othertag", "tag"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth" + + def test_postfixed_tags(self): + source = "#foo) #bar] #hoo, #hee." + tags, text = find_tags(source) + assert tags == {"foo", "bar", "hoo", "hee"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee." + + def test_prefixed_tags(self): + source = "(#foo [#bar" + tags, text = find_tags(source) + assert tags == {"foo", "bar"} + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == "(#foo/foo [#bar/bar" + + def test_invalid_text_returns_no_tags(self): + source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \ + "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd" + tags, text = find_tags(source) + assert tags == set() + assert text == source + tags, text = find_tags(source, replacer=self._replacer) + assert text == source + + class TestProcessTextLinks: def test_link_at_start_or_end(self): assert process_text_links('https://example.org example.org\nhttp://example.org') == \ diff --git a/federation/utils/text.py b/federation/utils/text.py index ad67973..8bfb02b 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -1,9 +1,12 @@ import re +from typing import Set, Tuple from urllib.parse import urlparse import bleach from bleach import callbacks +ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" + def decode_if_bytes(text): try: @@ -19,6 +22,54 @@ def encode_if_text(text): return text +def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: + """Find tags in text. + + Tries to ignore tags inside code blocks. + + Optionally, if passed a "replacer", will also replace the tag word with the result + of the replacer function called with the tag word. + + Returns a set of tags and the original or replaced text. + """ + found_tags = set() + lines = text.splitlines(keepends=True) + final_lines = [] + code_block = False + final_text = None + # Check each line separately + for line in lines: + final_words = [] + if line[0:3] == "```": + code_block = not code_block + if line.find("#") == -1 or line[0:4] == " " or code_block: + # Just add the whole line + final_lines.append(line) + continue + # Check each word separately + words = line.split(" ") + for word in words: + candidate = word.strip().strip("([]),.!?:") + if candidate.startswith("#"): + candidate = candidate.strip("#") + if test_tag(candidate.lower()): + found_tags.add(candidate.lower()) + if replacer: + try: + tag_word = word.replace("#%s" % candidate, replacer(candidate)) + final_words.append(tag_word) + except Exception: + final_words.append(word) + else: + final_words.append(word) + else: + final_words.append(word) + final_lines.append(" ".join(final_words)) + if replacer: + final_text = "".join(final_lines) + return found_tags, final_text or text + + def get_path_from_url(url: str) -> str: """ Return only the path part of an URL. @@ -50,6 +101,16 @@ def process_text_links(text): ) +def test_tag(tag: str) -> bool: + """Test a word whether it could be accepted as a tag.""" + if not tag: + return False + for char in ILLEGAL_TAG_CHARS: + if char in tag: + return False + return True + + def validate_handle(handle): """ Very basic handle validation as per