From 0f6bd920c34f43f99327b8d3a4b3c9a3df2d1879 Mon Sep 17 00:00:00 2001 From: Denis Laxalde Date: Tue, 1 Jan 2019 22:55:49 +0100 Subject: [PATCH] Replace &apos; by "'" before parsing HTML BeautifulSoup does not parse HTML entities like `&apos;` as we expect, and the previous logic of replacing this *after* HTML parsing occurred did not produce the expected results. To illustrate this, we change data in "test_timeline" to include a literal `&apos;` as it sometimes occurs in data returned by the Mastodon API. New HTML content is:

The computer can&apos;t tell you the emotional story [...]

BeautifulSoup will parse this as:

The computer can&apost tell you the emotional story [...]

which is not what we expect. We fix this by replacing `&apos;` *before* HTML parsing by BeautifulSoup. Since test data in "test_timeline" got updated we also add an extra assertion checking that part of the content with a literal "'" is (still) properly rendered. --- tests/test_console.py | 3 ++- toot/output.py | 4 ++-- toot/utils.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_console.py b/tests/test_console.py index 35f032b..3f28e04 100644 --- a/tests/test_console.py +++ b/tests/test_console.py @@ -126,7 +126,7 @@ def test_timeline(mock_get, monkeypatch, capsys): 'username': 'fz' }, 'created_at': '2017-04-12T15:53:18.174Z', - 'content': "

The computer can't tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.

", + 'content': "

The computer can't tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.

", 'reblog': None, }]) @@ -136,6 +136,7 @@ def test_timeline(mock_get, monkeypatch, capsys): out, err = capsys.readouterr() assert "The computer can't tell you the emotional story." in out + assert "but what's missing is the eyebrows." in out assert "Frank Zappa" in out assert "@fz" in out diff --git a/toot/output.py b/toot/output.py index 607fd95..36fec5f 100644 --- a/toot/output.py +++ b/toot/output.py @@ -148,8 +148,8 @@ def print_timeline(items): content = item['reblog']['content'] if item['reblog'] else item['content'] reblogged = item['reblog']['account']['username'] if item['reblog'] else None - soup = BeautifulSoup(content, "html.parser") - text = soup.get_text().replace(''', "'") + soup = BeautifulSoup(content.replace(''', "'"), "html.parser") + text = soup.get_text() time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ") return { diff --git a/toot/utils.py b/toot/utils.py index dc22bfb..1b590f3 100644 --- a/toot/utils.py +++ b/toot/utils.py @@ -11,7 +11,7 @@ from toot.exceptions import ConsoleError def get_text(html): """Converts html to text, strips all tags.""" - text = BeautifulSoup(html, "html.parser").get_text().replace(''', "'") + text = BeautifulSoup(html.replace(''', "'"), "html.parser").get_text() return unicodedata.normalize('NFKC', text)