Merge branch 'find-tags' into 'master'

Various improvements to outbound HTML payload rendering for ActivityPub networks

Closes socialhome/socialhome#571 and #70

See merge request jaywink/federation!159
merge-requests/160/merge
jaywink 2020-02-17 22:10:16 +00:00
commit b2a214ec81
7 zmienionych plików z 167 dodań i 10 usunięć

Wyświetl plik

@ -11,6 +11,10 @@
* Add `process_text_links` text utility to linkify URLs in text.
* Add `find_tags` text utility to find hashtags from text. Optionally the function can
also replace the tags through a given `replacer` function. This utility is used
to improve the tag extraction logic from entities text fields. ([related issue](https://git.feneas.org/jaywink/federation/issues/70))
### Changed
* The NodeInfo2 hostmeta parser now cleans the port out of the host name.

Wyświetl plik

@ -8,7 +8,7 @@ from commonmark import commonmark
from federation.entities.activitypub.enums import ActivityType
from federation.entities.utils import get_name_for_profile
from federation.utils.text import process_text_links
from federation.utils.text import process_text_links, find_tags
class BaseEntity:
@ -204,10 +204,27 @@ class RawContentMixin(BaseEntity):
@property
def rendered_content(self) -> str:
"""Returns the rendered version of raw_content, or just raw_content."""
from federation.utils.django import get_configuration
try:
config = get_configuration()
if config["tags_path"]:
def linkifier(tag: str) -> str:
return f'<a href="{config["base_url"]}{config["tags_path"].replace(":tag:", tag.lower())}" ' \
f'class="mention hashtag" rel="noopener noreferrer">' \
f'#<span>{tag}</span></a>'
else:
linkifier = None
except ImportError:
linkifier = None
if self._rendered_content:
return self._rendered_content
elif self._media_type == "text/markdown" and self.raw_content:
rendered = commonmark(self.raw_content).strip()
# Do tags
_tags, rendered = find_tags(self.raw_content, replacer=linkifier)
# Render markdown to HTML
rendered = commonmark(rendered).strip()
# Do mentions
if self._mentions:
for mention in self._mentions:
# Only linkify mentions that are URL's
@ -218,7 +235,7 @@ class RawContentMixin(BaseEntity):
display_name = mention
rendered = rendered.replace(
"@{%s}" % mention,
f'@<a href="{mention}" class="mention">{display_name}</a>',
f'@<a href="{mention}" class="mention"><span>{display_name}</span></a>',
)
# Finally linkify remaining URL's that are not links
rendered = process_text_links(rendered)
@ -230,7 +247,7 @@ class RawContentMixin(BaseEntity):
"""Returns a `list` of unique tags contained in `raw_content`."""
if not self.raw_content:
return []
tags = {word.strip("#").lower() for word in self.raw_content.split() if word.startswith("#") and len(word) > 1}
tags, _text = find_tags(self.raw_content)
return sorted(tags)
def extract_mentions(self):

Wyświetl plik

@ -34,5 +34,5 @@ def get_name_for_profile(fid: str) -> Optional[str]:
return profile.username
else:
return profile.name
except ImportError:
except Exception:
pass

Wyświetl plik

@ -2,6 +2,7 @@ import copy
import importlib
import json
import logging
import traceback
from typing import List, Dict, Union
# noinspection PyPackageRequirements
@ -170,8 +171,10 @@ def handle_send(
if isinstance(payload.get("object"), dict):
payload["object"]["to"] = [fid]
rendered_payload = json.dumps(payload).encode("utf-8")
except Exception as ex:
logger.error("handle_send - failed to generate payload for %s, %s: %s", fid, endpoint, ex)
except Exception:
logger.error(
"handle_send - failed to generate payload for %s, %s: %s", fid, endpoint, traceback.format_exc(),
)
continue
payloads.append({
"auth": get_http_authentication(author_user.rsa_private_key, f"{author_user.id}#main-key"),

Wyświetl plik

@ -181,7 +181,7 @@ class TestEntitiesConvertToAS2:
'attributedTo': 'http://127.0.0.1:8000/profile/123456/',
'content': '<h1>raw_content</h1>\n<p>@{someone@localhost.local} @<a class="mention" '
'href="http://localhost.local/someone" rel="nofollow" target="_blank">'
'Bob Bobértson</a></p>',
'<span>Bob Bobértson</span></a></p>',
'published': '2019-04-27T00:00:00',
'inReplyTo': None,
'sensitive': False,
@ -234,7 +234,15 @@ class TestEntitiesConvertToAS2:
'id': 'http://127.0.0.1:8000/post/123456/',
'type': 'Note',
'attributedTo': 'http://127.0.0.1:8000/profile/123456/',
'content': '<h1>raw_content</h1>\n<p>#foobar\n#barfoo</p>',
'content': '<h1>raw_content</h1>\n'
'<p><a class="mention hashtag" '
'href="https://example.com/tag/foobar/" rel="noopener '
'noreferrer nofollow" '
'target="_blank">#<span>foobar</span></a>\n'
'<a class="mention hashtag" '
'href="https://example.com/tag/barfoo/" rel="noopener '
'noreferrer nofollow" '
'target="_blank">#<span>barfoo</span></a></p>',
'published': '2019-04-27T00:00:00',
'inReplyTo': None,
'sensitive': False,

Wyświetl plik

@ -1,4 +1,4 @@
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links
from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
def test_decode_if_bytes():
@ -11,6 +11,70 @@ def test_encode_if_text():
assert encode_if_text("foobar") == b"foobar"
class TestFindTags:
    """Tests for the ``find_tags`` text utility.

    Each test checks both the plain call (tags found, text returned unchanged)
    and the ``replacer`` variant, where every recognised tag in the text is
    rewritten through the replacer function.

    NOTE(review): this block was recovered from a diff rendering that collapses
    runs of whitespace and drops some non-ASCII characters; string literals
    flagged below were reconstructed — verify against version control.
    """

    @staticmethod
    def _replacer(text):
        # Replacement helper that makes substitution visible in the output:
        # a tag word "Word" is rewritten as "#Word/word".
        return f"#{text}/{text.lower()}"

    def test_all_tags_are_parsed_from_text(self):
        source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
        tags, text = find_tags(source)
        # Tags are collected lower-cased; without a replacer the text is unchanged.
        assert tags == {"starting", "mixed", "line", "start"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \
                       "#start/start on new line"

    def test_code_block_tags_ignored(self):
        # Tags inside ``` fences and on 4-space-indented (code) lines are skipped.
        # NOTE(review): the four spaces before "#alsocode" were reconstructed
        # from a collapsed single space — confirm against VCS.
        source = "foo\n```\n#code\n```\n#notcode\n\n    #alsocode\n"
        tags, text = find_tags(source)
        assert tags == {"notcode"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n    #alsocode\n"

    def test_endings_are_filtered_out(self):
        # Trailing punctuation is not considered part of the tag.
        source = "#parenthesis) #exp! #list]"
        tags, text = find_tags(source)
        assert tags == {"parenthesis", "exp", "list"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "#parenthesis/parenthesis) #exp/exp! #list/list]"

    def test_finds_tags(self):
        # Tags are detected between other markup and across line breaks.
        source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
        tags, text = find_tags(source)
        assert tags == {"third", "fourth", "post", "othertag", "tag"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"

    def test_postfixed_tags(self):
        # Closing punctuation after the tag word is preserved in replaced text.
        source = "#foo) #bar] #hoo, #hee."
        tags, text = find_tags(source)
        assert tags == {"foo", "bar", "hoo", "hee"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."

    def test_prefixed_tags(self):
        # Leading punctuation before the hash does not prevent detection.
        source = "(#foo [#bar"
        tags, text = find_tags(source)
        assert tags == {"foo", "bar"}
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == "(#foo/foo [#bar/bar"

    def test_invalid_text_returns_no_tags(self):
        # Every candidate here contains a character from ILLEGAL_TAG_CHARS and
        # must therefore be rejected, leaving the text untouched.
        # NOTE(review): "#a’a" was reconstructed from a mangled "#aa" in the
        # rendering — confirm the exact characters against VCS.
        source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a/a #a(a #a)a #a=a " \
                 "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
        tags, text = find_tags(source)
        assert tags == set()
        assert text == source
        tags, text = find_tags(source, replacer=self._replacer)
        assert text == source
class TestProcessTextLinks:
def test_link_at_start_or_end(self):
assert process_text_links('https://example.org example.org\nhttp://example.org') == \

Wyświetl plik

@ -1,9 +1,12 @@
import re
from typing import Set, Tuple
from urllib.parse import urlparse
import bleach
from bleach import callbacks
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
def decode_if_bytes(text):
try:
@ -19,6 +22,54 @@ def encode_if_text(text):
return text
def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
    """Find tags in text.

    Tries to ignore tags inside code blocks.

    Optionally, if passed a "replacer", will also replace the tag word with the result
    of the replacer function called with the tag word.

    Returns a set of tags and the original or replaced text.
    """
    tags = set()
    in_code_fence = False
    collected = []

    def handle_word(word: str) -> str:
        # Strip surrounding whitespace and common punctuation before testing
        # the word as a tag candidate; the original word is what gets rewritten.
        candidate = word.strip().strip("([]),.!?:")
        if not candidate.startswith("#"):
            return word
        candidate = candidate.strip("#")
        if not test_tag(candidate.lower()):
            return word
        tags.add(candidate.lower())
        if not replacer:
            return word
        try:
            return word.replace("#%s" % candidate, replacer(candidate))
        except Exception:
            # A failing replacer must not lose the user's original word.
            return word

    # Process line by line so code fences and indented code can be skipped.
    for line in text.splitlines(keepends=True):
        if line[0:3] == "```":
            # A fence line toggles code mode; the fence itself passes through.
            in_code_fence = not in_code_fence
        if line.find("#") == -1 or line[0:4] == "    " or in_code_fence:
            # No hashes, 4-space-indented code, or fenced code: keep untouched.
            collected.append(line)
            continue
        collected.append(" ".join(handle_word(word) for word in line.split(" ")))

    if replacer:
        replaced = "".join(collected)
        # Fall back to the input when replacement produced an empty string.
        return tags, replaced or text
    return tags, text
def get_path_from_url(url: str) -> str:
"""
Return only the path part of an URL.
@ -50,6 +101,16 @@ def process_text_links(text):
)
def test_tag(tag: str) -> bool:
"""Test a word whether it could be accepted as a tag."""
if not tag:
return False
for char in ILLEGAL_TAG_CHARS:
if char in tag:
return False
return True
def validate_handle(handle):
"""
Very basic handle validation as per