From 0f6bd920c34f43f99327b8d3a4b3c9a3df2d1879 Mon Sep 17 00:00:00 2001
From: Denis Laxalde <denis@laxalde.org>
Date: Tue, 1 Jan 2019 22:55:49 +0100
Subject: [PATCH] Replace &apos; by "'" before parsing HTML

BeautifulSoup does not parse HTML entities like `&apos;` as we expect,
and the previous logic of replacing this *after* HTML parsing occurred
did not produce the expected results.

To illustrate this, we change data in "test_timeline" to include a
literal `&apos;`, as sometimes occurs in data returned by the Mastodon
API. The new HTML content is:

    <p>The computer can&apos;t tell you the emotional story [...] </p>

BeautifulSoup will parse this as:

    <p>The computer can&amp;apost tell you the emotional story [...] </p>

which is not what we expect.

We fix this by replacing `&apos;` *before* HTML parsing by BeautifulSoup.
Since the test data in "test_timeline" got updated, we also add an extra
assertion checking that part of the content with a literal "'" is
(still) properly rendered.
---
 tests/test_console.py | 3 ++-
 toot/output.py        | 4 ++--
 toot/utils.py         | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/tests/test_console.py b/tests/test_console.py
index 35f032b..3f28e04 100644
--- a/tests/test_console.py
+++ b/tests/test_console.py
@@ -126,7 +126,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
             'username': 'fz'
         },
         'created_at': '2017-04-12T15:53:18.174Z',
-        'content': "<p>The computer can't tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>",
+        'content': "<p>The computer can&apos;t tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>",
         'reblog': None,
     }])
 
@@ -136,6 +136,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
 
     out, err = capsys.readouterr()
     assert "The computer can't tell you the emotional story." in out
+    assert "but what's missing is the eyebrows." in out
     assert "Frank Zappa" in out
     assert "@fz" in out
 
diff --git a/toot/output.py b/toot/output.py
index 607fd95..36fec5f 100644
--- a/toot/output.py
+++ b/toot/output.py
@@ -148,8 +148,8 @@ def print_timeline(items):
         content = item['reblog']['content'] if item['reblog'] else item['content']
         reblogged = item['reblog']['account']['username'] if item['reblog'] else None
 
-        soup = BeautifulSoup(content, "html.parser")
-        text = soup.get_text().replace('&apos;', "'")
+        soup = BeautifulSoup(content.replace('&apos;', "'"), "html.parser")
+        text = soup.get_text()
         time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ")
 
         return {
diff --git a/toot/utils.py b/toot/utils.py
index dc22bfb..1b590f3 100644
--- a/toot/utils.py
+++ b/toot/utils.py
@@ -11,7 +11,7 @@ from toot.exceptions import ConsoleError
 
 def get_text(html):
     """Converts html to text, strips all tags."""
-    text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'")
+    text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
 
     return unicodedata.normalize('NFKC', text)