Correctly handle embeds inside inline styles/entities in richtext contentstate conversion (#7338)

Fixes #4602 as per https://github.com/wagtail/wagtail/issues/4602#issuecomment-479539444 (option 2). Previously, given HTML input such as: <p> <i>a bunch of text before <embed alt="somepic" embedtype="image" format="fullwidth" id="1"/> after</i> </p> the `<embed>` would start a new block, but the converter would keep hold of references to currently-open tags such as the `<i>`, so that when the corresponding `</i>` tag was encountered, it could match it up to the opening tag and fill in the 'length' field on the resulting InlineStyleRange object. However, since the span length is calculated based on the text content of the _current_ block (which is now "after"), it would obtain the wrong result - or, when there is no content between the embed and the closing tag (and thus no current block), would throw the exception `'NoneType' object has no attribute 'text'`. In this new approach, when the embed is encountered, the current block is closed _along with all of its styles and entities_, causing the lengths of those spans to be filled in correctly. After inserting the embed, the current block is then set to a replica of the previous block with all those styles and entities reopened, so that when the closing tag is finally encountered, the span length is correctly set based on the new 'after' block.
2019-04-03 13:58:10 +01:00 · 2019-04-03 13:58:10 +01:00 · 2bec229e12
commit 2bec229e12
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@ -37,6 +37,7 @@ Changelog
 * Fix: Ensure that `editor` and `features` arguments on RichTextField are preserved by `clone()` (Daniel Fairhead)
 * Fix: Rename 'spin' CSS animation to avoid clashes with other libraries (Kevin Gutiérrez)
 * Fix: Prevent crash when copying a page from a section where the user has no publish permission (Karl Hobley)
+ * Fix: Ensure that rich text conversion correctly handles images / embeds inside links or inline styles (Matt Westcott)


 2.13.4 (13.07.2021)
--- a/docs/releases/2.14.rst
+++ b/docs/releases/2.14.rst
@ -48,6 +48,7 @@ Bug fixes
 * Ensure that ``editor`` and ``features`` arguments on RichTextField are preserved by ``clone()`` (Daniel Fairhead)
 * Rename 'spin' CSS animation to avoid clashes with other libraries (Kevin Gutiérrez)
 * Prevent crash when copying a page from a section where the user has no publish permission (Karl Hobley)
+ * Ensure that rich text conversion correctly handles images / embeds inside links or inline styles (Matt Westcott)

 Upgrade considerations
 ======================
--- a/wagtail/admin/rich_text/converters/html_to_contentstate.py
+++ b/wagtail/admin/rich_text/converters/html_to_contentstate.py
@ -233,13 +233,42 @@ class AtomicBlockEntityElementHandler:
    Handler for elements like <img> that exist as a single immutable item at the block level
    """
    def handle_starttag(self, name, attrs, state, contentstate):
-        # forcibly close any block that illegally contains this one
-        state.current_block = None
+        if state.current_block:
+            # Placing an atomic block inside another block (e.g. a paragraph) is invalid in
+            # contentstate; we will recover from this by forcibly closing the block along with all
+            # of its inline styles / entities, and opening a new identical one afterwards.
+
+            # Construct a new block of the same type and depth as the currently open one; this will
+            # become the new 'current block' after we've added the atomic block.
+            next_block = Block(state.current_block.type, depth=state.current_block.depth)
+
+            for inline_style_range in state.current_inline_styles:
+                # set this inline style to end at the current text position
+                inline_style_range.length = len(state.current_block.text) - inline_style_range.offset
+                # start a new one of the same type, which will begin at the next block
+                new_inline_style = InlineStyleRange(inline_style_range.style)
+                new_inline_style.offset = 0
+                next_block.inline_style_ranges.append(new_inline_style)
+
+            for entity_range in state.current_entity_ranges:
+                # set this inline entity to end at the current text position
+                entity_range.length = len(state.current_block.text) - entity_range.offset
+                # start a new entity range, pointing to the same entity, to begin at the next block
+                new_entity_range = EntityRange(entity_range.key)
+                new_entity_range.offset = 0
+                next_block.entity_ranges.append(new_entity_range)
+
+            state.current_block = None
+        else:
+            next_block = None

        if not state.has_preceding_nonatomic_block:
            # if this block is NOT preceded by a non-atomic block,
            # need to insert a spacer paragraph
            add_paragraph_block(state, contentstate)
+            # immediately set this as not the current block, so that any subsequent invocations
+            # of this handler don't think we're inside it
+            state.current_block = None

        attr_dict = dict(attrs)  # convert attrs from list of (name, value) tuples to a dict
        entity = self.create_entity(name, attr_dict, state, contentstate)
@ -254,6 +283,18 @@ class AtomicBlockEntityElementHandler:
        block.entity_ranges.append(entity_range)
        state.has_preceding_nonatomic_block = False

+        if next_block:
+            # take the replica that we made of the previous block and its inline styles / entities,
+            # and make that the new current block. Now, when we encounter the closing tags for
+            # those styles/entities further on in the document, they will close the range that
+            # began here.
+            contentstate.blocks.append(next_block)
+            state.current_block = next_block
+            state.current_inline_styles = next_block.inline_style_ranges.copy()
+            state.current_entity_ranges = next_block.entity_ranges.copy()
+            state.has_preceding_nonatomic_block = True
+            state.leading_whitespace = STRIP_WHITESPACE
+
    def handle_endtag(self, name, state, contentstate):
        pass

--- a/wagtail/admin/tests/test_contentstate.py
+++ b/wagtail/admin/tests/test_contentstate.py
@ -38,7 +38,10 @@ class TestHtmlToContentState(TestCase):

    def assertContentStateEqual(self, v1, v2, match_keys=False):
        "Assert that two contentState structures are equal, ignoring 'key' properties if match_keys is False"
-        self.assertTrue(content_state_equal(v1, v2, match_keys=match_keys), "%r does not match %r" % (v1, v2))
+        self.assertTrue(
+            content_state_equal(v1, v2, match_keys=match_keys),
+            "%s does not match %s" % (json.dumps(v1, indent=4), json.dumps(v2, indent=4))
+        )

    def test_paragraphs(self):
        converter = ContentstateConverter(features=[])
@ -829,6 +832,95 @@ class TestHtmlToContentState(TestCase):
            'entityMap': {}
        })

+    def test_image_inside_paragraph(self):
+        # In Draftail's data model, images are block-level elements and therefore
+        # split up preceding / following text into their own paragraphs
+        converter = ContentstateConverter(features=['image'])
+        result = json.loads(converter.from_database_format(
+            '''
+            <p>before <embed embedtype="image" alt="an image" id="1" format="left" /> after</p>
+            '''
+        ))
+        self.assertContentStateEqual(result, {
+            'blocks': [
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [], 'depth': 0, 'text': 'before', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 0, 'offset': 0, 'length': 1}], 'depth': 0, 'text': ' ', 'type': 'atomic'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [], 'depth': 0, 'text': 'after', 'type': 'unstyled'}
+            ],
+            'entityMap': {
+                '0': {
+                    'data': {'format': 'left', 'alt': 'an image', 'id': '1', 'src': '/media/not-found'},
+                    'mutability': 'IMMUTABLE', 'type': 'IMAGE'
+                }
+            }
+        })
+
+    def test_image_inside_style(self):
+        # https://github.com/wagtail/wagtail/issues/4602 - ensure that an <embed> inside
+        # an inline style is handled. This is not valid in Draftail as images are block-level,
+        # but should be handled without errors, splitting the image into its own block
+        converter = ContentstateConverter(features=['image', 'italic'])
+        result = json.loads(converter.from_database_format(
+            '''
+            <p><i>before <embed embedtype="image" alt="an image" id="1" format="left" /> after</i></p>
+            <p><i><embed embedtype="image" alt="an image" id="1" format="left" /></i></p>
+            '''
+        ))
+        self.assertContentStateEqual(result, {
+            'blocks': [
+                {'key': '00000', 'inlineStyleRanges': [{'offset': 0, 'length': 6, 'style': 'ITALIC'}], 'entityRanges': [], 'depth': 0, 'text': 'before', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 0, 'offset': 0, 'length': 1}], 'depth': 0, 'text': ' ', 'type': 'atomic'},
+                {'key': '00000', 'inlineStyleRanges': [{'offset': 0, 'length': 5, 'style': 'ITALIC'}], 'entityRanges': [], 'depth': 0, 'text': 'after', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [{'offset': 0, 'length': 0, 'style': 'ITALIC'}], 'entityRanges': [], 'depth': 0, 'text': '', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 1, 'offset': 0, 'length': 1}], 'depth': 0, 'text': ' ', 'type': 'atomic'},
+                {'key': '00000', 'inlineStyleRanges': [{'offset': 0, 'length': 0, 'style': 'ITALIC'}], 'entityRanges': [], 'depth': 0, 'text': '', 'type': 'unstyled'},
+            ],
+            'entityMap': {
+                '0': {
+                    'data': {'format': 'left', 'alt': 'an image', 'id': '1', 'src': '/media/not-found'},
+                    'mutability': 'IMMUTABLE', 'type': 'IMAGE'
+                },
+                '1': {
+                    'data': {'format': 'left', 'alt': 'an image', 'id': '1', 'src': '/media/not-found'},
+                    'mutability': 'IMMUTABLE', 'type': 'IMAGE'
+                },
+            }
+        })
+
+    def test_image_inside_link(self):
+        # https://github.com/wagtail/wagtail/issues/4602 - ensure that an <embed> inside
+        # a link is handled. This is not valid in Draftail as images are block-level,
+        # but should be handled without errors, splitting the image into its own block
+        converter = ContentstateConverter(features=['image', 'link'])
+        result = json.loads(converter.from_database_format(
+            '''
+            <p><a href="https://wagtail.io">before <embed embedtype="image" alt="an image" id="1" format="left" /> after</a></p>
+            <p><a href="https://wagtail.io"><embed embedtype="image" alt="an image" id="1" format="left" /></a></p>
+            '''
+        ))
+        self.assertContentStateEqual(result, {
+            'blocks': [
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 0, 'offset': 0, 'length': 6}], 'depth': 0, 'text': 'before', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 1, 'offset': 0, 'length': 1}], 'depth': 0, 'text': ' ', 'type': 'atomic'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 0, 'offset': 0, 'length': 5}], 'depth': 0, 'text': 'after', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 2, 'offset': 0, 'length': 0}], 'depth': 0, 'text': '', 'type': 'unstyled'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 3, 'offset': 0, 'length': 1}], 'depth': 0, 'text': ' ', 'type': 'atomic'},
+                {'key': '00000', 'inlineStyleRanges': [], 'entityRanges': [{'key': 2, 'offset': 0, 'length': 0}], 'depth': 0, 'text': '', 'type': 'unstyled'},
+            ],
+            'entityMap': {
+                '0': {'mutability': 'MUTABLE', 'type': 'LINK', 'data': {'url': 'https://wagtail.io'}},
+                '1': {
+                    'data': {'format': 'left', 'alt': 'an image', 'id': '1', 'src': '/media/not-found'},
+                    'mutability': 'IMMUTABLE', 'type': 'IMAGE'
+                },
+                '2': {'mutability': 'MUTABLE', 'type': 'LINK', 'data': {'url': 'https://wagtail.io'}},
+                '3': {
+                    'data': {'format': 'left', 'alt': 'an image', 'id': '1', 'src': '/media/not-found'},
+                    'mutability': 'IMMUTABLE', 'type': 'IMAGE'
+                },
+            }
+        })
+

 class TestContentStateToHtml(TestCase):
    def test_external_link(self):