diff --git a/CHANGELOG.md b/CHANGELOG.md
index 90601be..e311c0f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,10 @@
* Add `process_text_links` text utility to linkify URL's in text.
+* Add `find_tags` text utility to find hashtags from text. Optionally the function can
+ also replace the tags using a given `replacer` function. This utility is used
+ to improve the tag extraction logic in entity text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70))
+
### Changed
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.
diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py
index 13f8ac4..3542a2d 100644
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@@ -8,7 +8,7 @@ from commonmark import commonmark
from federation.entities.activitypub.enums import ActivityType
from federation.entities.utils import get_name_for_profile
-from federation.utils.text import process_text_links
+from federation.utils.text import process_text_links, find_tags
class BaseEntity:
@@ -204,10 +204,27 @@ class RawContentMixin(BaseEntity):
@property
def rendered_content(self) -> str:
"""Returns the rendered version of raw_content, or just raw_content."""
+ from federation.utils.django import get_configuration
+ try:
+ config = get_configuration()
+ if config["tags_path"]:
+ def linkifier(tag: str) -> str:
+ return f'<a class="hashtag" href="{config["tags_path"].replace(":tag:", tag)}">' \
+ f'#{tag}</a>'
+ else:
+ linkifier = None
+ except ImportError:
+ linkifier = None
+
if self._rendered_content:
return self._rendered_content
elif self._media_type == "text/markdown" and self.raw_content:
- rendered = commonmark(self.raw_content).strip()
+ # Do tags
+ _tags, rendered = find_tags(self.raw_content, replacer=linkifier)
+ # Render markdown to HTML
+ rendered = commonmark(rendered).strip()
+ # Do mentions
if self._mentions:
for mention in self._mentions:
# Only linkify mentions that are URL's
@@ -218,7 +235,7 @@ class RawContentMixin(BaseEntity):
display_name = mention
rendered = rendered.replace(
"@{%s}" % mention,
- f'@{display_name}',
+ f'@{display_name}',
)
# Finally linkify remaining URL's that are not links
rendered = process_text_links(rendered)
@@ -230,7 +247,7 @@ class RawContentMixin(BaseEntity):
"""Returns a `list` of unique tags contained in `raw_content`."""
if not self.raw_content:
return []
- tags = {word.strip("#").lower() for word in self.raw_content.split() if word.startswith("#") and len(word) > 1}
+ tags, _text = find_tags(self.raw_content)
return sorted(tags)
def extract_mentions(self):
diff --git a/federation/entities/utils.py b/federation/entities/utils.py
index 6126051..40a5c52 100644
--- a/federation/entities/utils.py
+++ b/federation/entities/utils.py
@@ -34,5 +34,5 @@ def get_name_for_profile(fid: str) -> Optional[str]:
return profile.username
else:
return profile.name
- except ImportError:
+ except Exception:
pass
diff --git a/federation/outbound.py b/federation/outbound.py
index 6219423..9956929 100644
--- a/federation/outbound.py
+++ b/federation/outbound.py
@@ -2,6 +2,7 @@ import copy
import importlib
import json
import logging
+import traceback
from typing import List, Dict, Union
# noinspection PyPackageRequirements
@@ -170,8 +171,10 @@ def handle_send(
if isinstance(payload.get("object"), dict):
payload["object"]["to"] = [fid]
rendered_payload = json.dumps(payload).encode("utf-8")
- except Exception as ex:
- logger.error("handle_send - failed to generate payload for %s, %s: %s", fid, endpoint, ex)
+ except Exception:
+ logger.error(
+ "handle_send - failed to generate payload for %s, %s: %s", fid, endpoint, traceback.format_exc(),
+ )
continue
payloads.append({
"auth": get_http_authentication(author_user.rsa_private_key, f"{author_user.id}#main-key"),
diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py
index a8dcf76..87ec829 100644
--- a/federation/tests/entities/activitypub/test_entities.py
+++ b/federation/tests/entities/activitypub/test_entities.py
@@ -181,7 +181,7 @@ class TestEntitiesConvertToAS2:
'attributedTo': 'http://127.0.0.1:8000/profile/123456/',
'content': '
raw_content
\n@{someone@localhost.local} @'
- 'Bob Bobértson
',
+ 'Bob Bobértson',
'published': '2019-04-27T00:00:00',
'inReplyTo': None,
'sensitive': False,
@@ -234,7 +234,15 @@ class TestEntitiesConvertToAS2:
'id': 'http://127.0.0.1:8000/post/123456/',
'type': 'Note',
'attributedTo': 'http://127.0.0.1:8000/profile/123456/',
- 'content': 'raw_content
\n#foobar\n#barfoo
',
+ 'content': 'raw_content
\n'
+ '#foobar\n'
+ '#barfoo
',
'published': '2019-04-27T00:00:00',
'inReplyTo': None,
'sensitive': False,
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 32aecf4..6f0c1b1 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -1,4 +1,4 @@
-from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
+from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
def test_decode_if_bytes():
@@ -11,6 +11,70 @@ def test_encode_if_text():
assert encode_if_text("foobar") == b"foobar"
+class TestFindTags:
+ @staticmethod
+ def _replacer(text):
+ return f"#{text}/{text.lower()}"
+
+ def test_all_tags_are_parsed_from_text(self):
+ source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
+ tags, text = find_tags(source)
+ assert tags == {"starting", "mixed", "line", "start"}
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \
+ "#start/start on new line"
+
+ def test_code_block_tags_ignored(self):
+ source = "foo\n```\n#code\n```\n#notcode\n\n #alsocode\n"
+ tags, text = find_tags(source)
+ assert tags == {"notcode"}
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n"
+
+ def test_endings_are_filtered_out(self):
+ source = "#parenthesis) #exp! #list]"
+ tags, text = find_tags(source)
+ assert tags == {"parenthesis", "exp", "list"}
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"
+
+ def test_finds_tags(self):
+ source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
+ tags, text = find_tags(source)
+ assert tags == {"third", "fourth", "post", "othertag", "tag"}
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
+
+ def test_postfixed_tags(self):
+ source = "#foo) #bar] #hoo, #hee."
+ tags, text = find_tags(source)
+ assert tags == {"foo", "bar", "hoo", "hee"}
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."
+
+ def test_prefixed_tags(self):
+ source = "(#foo [#bar"
+ tags, text = find_tags(source)
+ assert tags == {"foo", "bar"}
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == "(#foo/foo [#bar/bar"
+
+ def test_invalid_text_returns_no_tags(self):
+ source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
+ "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
+ tags, text = find_tags(source)
+ assert tags == set()
+ assert text == source
+ tags, text = find_tags(source, replacer=self._replacer)
+ assert text == source
+
+
class TestProcessTextLinks:
def test_link_at_start_or_end(self):
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
diff --git a/federation/utils/text.py b/federation/utils/text.py
index ad67973..8bfb02b 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -1,9 +1,12 @@
import re
+from typing import Set, Tuple
from urllib.parse import urlparse
import bleach
from bleach import callbacks
+ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
+
def decode_if_bytes(text):
try:
@@ -19,6 +22,54 @@ def encode_if_text(text):
return text
+def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
+ """Find tags in text.
+
+ Tries to ignore tags inside code blocks.
+
+ Optionally, if passed a "replacer", will also replace the tag word with the result
+ of the replacer function called with the tag word.
+
+ Returns a set of tags and the original or replaced text.
+ """
+ found_tags = set()
+ lines = text.splitlines(keepends=True)
+ final_lines = []
+ code_block = False
+ final_text = None
+ # Check each line separately
+ for line in lines:
+ final_words = []
+ if line[0:3] == "```":
+ code_block = not code_block
+ if line.find("#") == -1 or line[0:4] == " " or code_block:
+ # Just add the whole line
+ final_lines.append(line)
+ continue
+ # Check each word separately
+ words = line.split(" ")
+ for word in words:
+ candidate = word.strip().strip("([]),.!?:")
+ if candidate.startswith("#"):
+ candidate = candidate.strip("#")
+ if test_tag(candidate.lower()):
+ found_tags.add(candidate.lower())
+ if replacer:
+ try:
+ tag_word = word.replace("#%s" % candidate, replacer(candidate))
+ final_words.append(tag_word)
+ except Exception:
+ final_words.append(word)
+ else:
+ final_words.append(word)
+ else:
+ final_words.append(word)
+ final_lines.append(" ".join(final_words))
+ if replacer:
+ final_text = "".join(final_lines)
+ return found_tags, final_text or text
+
+
def get_path_from_url(url: str) -> str:
"""
Return only the path part of an URL.
@@ -50,6 +101,16 @@ def process_text_links(text):
)
+def test_tag(tag: str) -> bool:
+ """Test a word whether it could be accepted as a tag."""
+ if not tag:
+ return False
+ for char in ILLEGAL_TAG_CHARS:
+ if char in tag:
+ return False
+ return True
+
+
def validate_handle(handle):
"""
Very basic handle validation as per