diff --git a/wagtail/core/blocks/field_block.py b/wagtail/core/blocks/field_block.py index 294531b4da..661df83183 100644 --- a/wagtail/core/blocks/field_block.py +++ b/wagtail/core/blocks/field_block.py @@ -1,5 +1,4 @@ import datetime -from html import unescape from django import forms from django.db.models.fields import BLANK_CHOICE_DASH @@ -8,10 +7,10 @@ from django.template.loader import render_to_string from django.utils.dateparse import parse_date, parse_datetime, parse_time from django.utils.encoding import force_str from django.utils.functional import cached_property -from django.utils.html import format_html, strip_tags +from django.utils.html import format_html from django.utils.safestring import mark_safe -from wagtail.core.rich_text import RichText +from wagtail.core.rich_text import RichText, get_text_for_indexing from wagtail.core.utils import resolve_model_string from .base import Block @@ -573,9 +572,9 @@ class RichTextBlock(FieldBlock): return RichText(value) def get_searchable_content(self, value): - # Strip HTML tags to prevent search backend earch backend from indexing them + # Strip HTML tags to prevent search backend from indexing them source = force_str(value.source) - return [unescape(strip_tags(source))] + return [get_text_for_indexing(source)] class Meta: icon = "doc-full" diff --git a/wagtail/core/fields.py b/wagtail/core/fields.py index a8b4b7fc26..dfcc17e94e 100644 --- a/wagtail/core/fields.py +++ b/wagtail/core/fields.py @@ -1,12 +1,11 @@ import json -from html import unescape from django.core.serializers.json import DjangoJSONEncoder from django.db import models from django.utils.encoding import force_str -from django.utils.html import strip_tags from wagtail.core.blocks import Block, BlockField, StreamBlock, StreamValue +from wagtail.core.rich_text import get_text_for_indexing class RichTextField(models.TextField): @@ -23,9 +22,9 @@ class RichTextField(models.TextField): return super().formfield(**defaults) def 
get_searchable_content(self, value): - # Strip HTML tags to prevent search backend earch backend from indexing them + # Strip HTML tags to prevent search backend from indexing them source = force_str(value) - return [unescape(strip_tags(source))] + return [get_text_for_indexing(source)] # https://github.com/django/django/blob/64200c14e0072ba0ffef86da46b2ea82fd1e019a/django/db/models/fields/subclassing.py#L31-L44 diff --git a/wagtail/core/rich_text/__init__.py b/wagtail/core/rich_text/__init__.py index e90b37d333..63104f5778 100644 --- a/wagtail/core/rich_text/__init__.py +++ b/wagtail/core/rich_text/__init__.py @@ -1,5 +1,9 @@ +import re +from html import unescape + from django.db.models import Model from django.template.loader import render_to_string +from django.utils.html import strip_tags from django.utils.safestring import mark_safe from wagtail.core.rich_text.feature_registry import FeatureRegistry @@ -33,6 +37,17 @@ def expand_db_html(html): return FRONTEND_REWRITER(html) +def get_text_for_indexing(richtext): + """ + Return a plain text version of a rich text string, suitable for search indexing; + like Django's strip_tags, but ensures that whitespace is left between block elements + so that

<p>hello</p><p>world</p> gives "hello world", not "helloworld". + """ + # insert space after

</p>, </h1> - </h6>, </li> and </blockquote> tags + richtext = re.sub(r'(</(p|h\d|li|blockquote)>)', r'\1 ', richtext, flags=re.IGNORECASE) + return unescape(strip_tags(richtext).strip()) + + class RichText: """ A custom object used to represent a renderable rich text value. diff --git a/wagtail/core/tests/test_blocks.py b/wagtail/core/tests/test_blocks.py index dba1062198..3d2cab35b0 100644 --- a/wagtail/core/tests/test_blocks.py +++ b/wagtail/core/tests/test_blocks.py @@ -558,11 +558,17 @@ class TestRichTextBlock(TestCase): result = block.get_searchable_content(value) self.assertEqual( result, [ - 'Merry Christmas! & a happy new year\n' + 'Merry Christmas! & a happy new year \n' 'Our Santa pet Wagtail has some cool stuff in store for you all!' ] ) + def test_get_searchable_content_whitespace(self): + block = blocks.RichTextBlock() + value = RichText('

<p>mashed</p><p>potatoes</p>') + result = block.get_searchable_content(value) + self.assertEqual(result, ['mashed potatoes']) + class TestChoiceBlock(WagtailTestUtils, SimpleTestCase): def setUp(self): diff --git a/wagtail/core/tests/test_rich_text.py b/wagtail/core/tests/test_rich_text.py index ef0f1162e5..8376cafdf3 100644 --- a/wagtail/core/tests/test_rich_text.py +++ b/wagtail/core/tests/test_rich_text.py @@ -173,3 +173,13 @@ class TestRichTextField(TestCase): value = body_field.value_from_object(christmas_page) result = body_field.get_searchable_content(value) self.assertEqual(result, ['Merry Christmas from Wagtail! & co.']) + + def test_get_searchable_content_whitespace(self): + christmas_page = EventPage.objects.get(url_path='/home/events/christmas/') + christmas_page.body = '

<p>mashed</p><p>potatoes</p>' + christmas_page.save_revision(submitted_for_moderation=False) + + body_field = christmas_page._meta.get_field('body') + value = body_field.value_from_object(christmas_page) + result = body_field.get_searchable_content(value) + self.assertEqual(result, ['mashed potatoes'])