From a0f77e180024ad8e0275034416088a7513dca067 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 4 Aug 2019 10:11:59 -0400 Subject: [PATCH 1/4] Improve HTML::toPlaintext - Ignore empty trimmed text nodes - Ignore anchor links - Ignore blank tags and avoids adding a doctype to transitional DOM objects --- src/Content/Text/HTML.php | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/Content/Text/HTML.php b/src/Content/Text/HTML.php index 4be217b3e..b9132c5d4 100644 --- a/src/Content/Text/HTML.php +++ b/src/Content/Text/HTML.php @@ -56,6 +56,7 @@ class HTML $xpath = new DOMXPath($doc); + /** @var \DOMNode[] $list */ $list = $xpath->query("//" . $tag); foreach ($list as $node) { $attr = []; @@ -98,9 +99,12 @@ class HTML $node->parentNode->insertBefore($StartCode, $node); if ($node->hasChildNodes()) { + /** @var \DOMNode $child */ foreach ($node->childNodes as $child) { - $newNode = $child->cloneNode(true); - $node->parentNode->insertBefore($newNode, $node); + if (trim($child->nodeValue)) { + $newNode = $child->cloneNode(true); + $node->parentNode->insertBefore($newNode, $node); + } } } @@ -559,6 +563,8 @@ class HTML $ignore = false; } + $ignore = $ignore || strpos($treffer[1], '#') === 0; + if (!$ignore) { $urls[$treffer[1]] = $treffer[1]; } @@ -582,7 +588,7 @@ class HTML $message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8"); - @$doc->loadHTML($message); + @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS); $message = $doc->saveHTML(); // Remove eventual UTF-8 BOM @@ -591,7 +597,7 @@ class HTML // Collecting all links $urls = self::collectURLs($message); - @$doc->loadHTML($message); + @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS); self::tagToBBCode($doc, 'html', [], '', ''); self::tagToBBCode($doc, 'body', [], '', ''); From c3e3e83a521b971cc90f17257bf2c4b793f26c68 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 4 Aug 2019 10:22:23 -0400 Subject: [PATCH 2/4] Improve 
BBCode::toPlaintext - Fix issue where matching literal square brackets were removed with their content - Fix issue where content without line feeds between BBCode tags would end up compacted in plain text - Update calls to BBCode::toPlaintext in api --- include/api.php | 4 ++-- src/Content/Text/BBCode.php | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/api.php b/include/api.php index 543c01827..a7d25c83f 100644 --- a/include/api.php +++ b/include/api.php @@ -614,7 +614,7 @@ function api_get_user(App $a, $contact_id = null) 'name' => $contact["name"], 'screen_name' => (($contact['nick']) ? $contact['nick'] : $contact['name']), 'location' => ($contact["location"] != "") ? $contact["location"] : ContactSelector::networkToName($contact['network'], $contact['url']), - 'description' => HTML::toPlaintext(BBCode::toPlaintext($contact["about"])), + 'description' => BBCode::toPlaintext($contact["about"]), 'profile_image_url' => $contact["micro"], 'profile_image_url_https' => $contact["micro"], 'profile_image_url_profile_size' => $contact["thumb"], @@ -693,7 +693,7 @@ function api_get_user(App $a, $contact_id = null) 'name' => (($uinfo[0]['name']) ? $uinfo[0]['name'] : $uinfo[0]['nick']), 'screen_name' => (($uinfo[0]['nick']) ? 
$uinfo[0]['nick'] : $uinfo[0]['name']), 'location' => $location, - 'description' => HTML::toPlaintext(BBCode::toPlaintext($description)), + 'description' => BBCode::toPlaintext($description), 'profile_image_url' => $uinfo[0]['micro'], 'profile_image_url_https' => $uinfo[0]['micro'], 'profile_image_url_profile_size' => $uinfo[0]["thumb"], diff --git a/src/Content/Text/BBCode.php b/src/Content/Text/BBCode.php index b2d4ebb5d..b012e79fb 100644 --- a/src/Content/Text/BBCode.php +++ b/src/Content/Text/BBCode.php @@ -357,10 +357,7 @@ class BBCode extends BaseObject */ public static function toPlaintext($text, $keep_urls = true) { - $naked_text = preg_replace('/\[.+?\]/','', $text); - if (!$keep_urls) { - $naked_text = preg_replace('#https?\://[^\s<]+[^\s\.\)]#i', '', $naked_text); - } + $naked_text = HTML::toPlaintext(BBCode::convert($text, false, 0, true), 0, !$keep_urls); return $naked_text; } From 4a85de4c199a9ea32d8cb14f08b2b3729eb655ad Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 4 Aug 2019 10:22:49 -0400 Subject: [PATCH 3/4] Add HTML output panel to babel module --- src/Module/Debug/Babel.php | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Module/Debug/Babel.php b/src/Module/Debug/Babel.php index be10da7ea..b9b629f07 100644 --- a/src/Module/Debug/Babel.php +++ b/src/Module/Debug/Babel.php @@ -148,6 +148,12 @@ class Babel extends BaseModule 'content' => htmlspecialchars($html2) ]; + $bbcode2plain = Text\BBCode::toPlaintext($bbcode); + $results[] = [ + 'title' => L10n::t('HTML::toBBCode => BBCode::toPlaintext'), + 'content' => '
' . $bbcode2plain . '
' + ]; + $markdown = Text\HTML::toMarkdown($html); $results[] = [ 'title' => L10n::t('HTML::toMarkdown'), @@ -162,7 +168,7 @@ class Babel extends BaseModule $text = Text\HTML::toPlaintext($html, 0, true); $results[] = [ - 'title' => L10n::t('HTML::toPlaintext'), + 'title' => L10n::t('HTML::toPlaintext (compact)'), 'content' => '
' . $text . '
' ]; } From 515935b241ee11afeea60de6a4954778ed41e6c3 Mon Sep 17 00:00:00 2001 From: Hypolite Petovan Date: Sun, 4 Aug 2019 10:24:25 -0400 Subject: [PATCH 4/4] Add test case for this bug - Fix whitespaces in MarkdownTest --- .../datasets/content/text/html/bug-7457.html | 1 + tests/datasets/content/text/html/bug-7457.txt | 5 + tests/src/Content/Text/HTMLTest.php | 53 +++++++++ tests/src/Content/Text/MarkdownTest.php | 104 +++++++++--------- 4 files changed, 111 insertions(+), 52 deletions(-) create mode 100644 tests/datasets/content/text/html/bug-7457.html create mode 100644 tests/datasets/content/text/html/bug-7457.txt create mode 100644 tests/src/Content/Text/HTMLTest.php diff --git a/tests/datasets/content/text/html/bug-7457.html b/tests/datasets/content/text/html/bug-7457.html new file mode 100644 index 000000000..4a2d4b33c --- /dev/null +++ b/tests/datasets/content/text/html/bug-7457.html @@ -0,0 +1 @@ +

[1.0.4] - 2019-08-01

Fixed

  • Invalid SemVer version generation, when the current branch does not have commits ahead of tag/checked out on a tag
\ No newline at end of file diff --git a/tests/datasets/content/text/html/bug-7457.txt b/tests/datasets/content/text/html/bug-7457.txt new file mode 100644 index 000000000..051071d55 --- /dev/null +++ b/tests/datasets/content/text/html/bug-7457.txt @@ -0,0 +1,5 @@ +*[1.0.4] - 2019-08-01* + +*Fixed* + +* Invalid SemVer version generation, when the current branch does not have commits ahead of tag/checked out on a tag \ No newline at end of file diff --git a/tests/src/Content/Text/HTMLTest.php b/tests/src/Content/Text/HTMLTest.php new file mode 100644 index 000000000..65ae05249 --- /dev/null +++ b/tests/src/Content/Text/HTMLTest.php @@ -0,0 +1,53 @@ +setUpVfsDir(); + $this->mockApp($this->root); + } + + public function dataHTML() + { + $inputFiles = glob(__DIR__ . '/../../../datasets/content/text/html/*.html'); + + $data = []; + + foreach ($inputFiles as $file) { + $data[str_replace('.html', '', $file)] = [ + 'input' => file_get_contents($file), + 'expected' => file_get_contents(str_replace('.html', '.txt', $file)) + ]; + } + + return $data; + } + + /** + * Test converting different input HTML into plain text + * + * @dataProvider dataHTML + * + * @param string $input The HTML text to test + * @param string $expected The expected plain text output + * @throws \Exception + */ + public function testToPlaintext($input, $expected) + { + $output = HTML::toPlaintext($input, 0); + + $this->assertEquals($expected, $output); + } +} diff --git a/tests/src/Content/Text/MarkdownTest.php b/tests/src/Content/Text/MarkdownTest.php index e39b46b2c..80421b522 100644 --- a/tests/src/Content/Text/MarkdownTest.php +++ b/tests/src/Content/Text/MarkdownTest.php @@ -1,52 +1,52 @@ -setUpVfsDir(); - $this->mockApp($this->root); - } - - public function dataMarkdown() - { - $inputFiles = glob(__DIR__ . 
'/../../../datasets/content/text/markdown/*.md'); - - $data = []; - - foreach ($inputFiles as $file) { - $data[str_replace('.md', '', $file)] = [ - 'input' => file_get_contents($file), - 'expected' => file_get_contents(str_replace('.md', '.html', $file)) - ]; - } - - return $data; - } - - /** - * Test convert different input Markdown text into HTML - * @dataProvider dataMarkdown - * - * @param string $input The Markdown text to test - * @param string $expected The expected HTML output - * @throws \Exception - */ - public function testConvert($input, $expected) - { - $output = Markdown::convert($input); - - $this->assertEquals($expected, $output); - } -} \ No newline at end of file +setUpVfsDir(); + $this->mockApp($this->root); + } + + public function dataMarkdown() + { + $inputFiles = glob(__DIR__ . '/../../../datasets/content/text/markdown/*.md'); + + $data = []; + + foreach ($inputFiles as $file) { + $data[str_replace('.md', '', $file)] = [ + 'input' => file_get_contents($file), + 'expected' => file_get_contents(str_replace('.md', '.html', $file)) + ]; + } + + return $data; + } + + /** + * Test convert different input Markdown text into HTML + * @dataProvider dataMarkdown + * + * @param string $input The Markdown text to test + * @param string $expected The expected HTML output + * @throws \Exception + */ + public function testConvert($input, $expected) + { + $output = Markdown::convert($input); + + $this->assertEquals($expected, $output); + } +}