From d1222208081c6608d7601297db0cbffc28ae5dee Mon Sep 17 00:00:00 2001 From: JensDiemer Date: Sat, 9 Oct 2021 15:39:37 +0200 Subject: [PATCH] Better tracebacks on html validation errors See also: https://github.com/boxine/bx_py_utils/pull/97 --- src/inventory_project/tests/temp_utils.py | 157 ++++++++++++++++++++-- 1 file changed, 146 insertions(+), 11 deletions(-) diff --git a/src/inventory_project/tests/temp_utils.py b/src/inventory_project/tests/temp_utils.py index ce4cb59..c9965eb 100644 --- a/src/inventory_project/tests/temp_utils.py +++ b/src/inventory_project/tests/temp_utils.py @@ -1,31 +1,166 @@ """ Remove this if https://github.com/boxine/bx_py_utils/pull/95 merged! """ +import inspect from pathlib import Path import pytest from bs4 import BeautifulSoup from bx_py_utils.test_utils.snapshot import assert_text_snapshot from django.http import HttpResponse -from lxml import html +from lxml import etree +from lxml.etree import XMLSyntaxError -def validate_html(data): - parser = html.HTMLParser( - recover=False, # Crash faster on broken HTML - ) - parser.feed(data) - parser.close() +def cutout(text, line_no, column, extra_lines=2): + assert isinstance(text, str) + assert line_no >= 0 + assert column >= 0 + assert extra_lines >= 0 + + lines = text.splitlines() + line_count = len(lines) + + assert line_no <= line_count + + from_line = line_no - extra_lines - 1 + if from_line < 0: + from_line = 0 + + to_line = line_no + extra_lines + if to_line > line_count: + to_line = line_count + + line_no_width = len(str(from_line)) + 1 + + lines = lines[from_line: to_line] + result = [] + for no, line in enumerate(lines, from_line + 1): + result.append( + f'{no:0{line_no_width}} {line}' + ) + if no == line_no: + result.append( + f'{"-"*(line_no_width+column+1)}^' + ) + + return '\n'.join(result) + + +def test_cutout(): + text = inspect.cleandoc(''' + line 1 + line 2 + 01234567890 line 3 + line 4 + line 5 + ''') + + output = cutout(text, line_no=3, column=5, extra_lines=1) + assert output == inspect.cleandoc(''' + 02 line 2 + 03 01234567890 line 3 + --------^ + 04 line 4 + ''') + + output = cutout(text, line_no=3, column=1, extra_lines=0) + assert output == inspect.cleandoc(''' + 03 01234567890 line 3 + ----^ + ''') + + output = cutout(text, line_no=3, column=10, extra_lines=2) + assert output == inspect.cleandoc(''' + 01 line 1 + 02 line 2 + 03 01234567890 line 3 + -------------^ + 04 line 4 + 05 line 5 + ''') + + text = '\n'.join(f'The Line {no}' for no in range(20)) + output = cutout(text, line_no=18, column=9, extra_lines=2) + assert output == inspect.cleandoc(''' + 016 The Line 15 + 017 The Line 16 + 018 The Line 17 + -------------^ + 019 The Line 18 + 020 The Line 19 + ''') + + +class InvalidHtml(AssertionError): + """ + XMLSyntaxError with better error messages: used in validate_html() + """ + + def __init__(self, *args): + self.args = args + + data, origin_err = args + assert isinstance(data, str) + assert isinstance(origin_err, XMLSyntaxError) + + self.origin_msg = origin_err.msg + + line_no, column = origin_err.position + self.cutoput_text = cutout(data, line_no, column, extra_lines=3) + + def __str__(self): + return ( + f'{self.origin_msg}\n' + f'{"-"*80}\n' + f'{self.cutoput_text}\n' + f'{"-"*80}' + ) + + +def validate_html(data, **parser_kwargs): + parser = etree.XMLParser(**parser_kwargs) + try: + parser.feed(data) + parser.close() + except XMLSyntaxError as err: + raise InvalidHtml(data, err) def test_validate_html(): validate_html('

Test

') validate_html('') - from lxml.etree import XMLSyntaxError - with pytest.raises(XMLSyntaxError) as exc_info: - validate_html('

>broken<

') - assert exc_info.value.args[0] == 'htmlParseStartTag: invalid element name, line 1, column 13' + validate_html('') + validate_html('') + validate_html('') + + with pytest.raises(InvalidHtml) as exc_info: + validate_html(inspect.cleandoc(''' + + + +

Test

+

>broken<

+

the end

+ +
+
+ ''')) + error_message = str(exc_info.value) + assert error_message == inspect.cleandoc(''' + StartTag: invalid element name, line 5, column 25 + -------------------------------------------------------------------------------- + 02 + 03 + 04

Test

+ 05

>broken<

+ ----------------------------^ + 06

the end

+ 07 + 08
+ -------------------------------------------------------------------------------- + ''') def pretty_format_html(data):