Ensure block level elements in rich text are whitespace-separated in indexed content

Fixes #6312
pull/6361/head
Matt Westcott 2020-08-18 11:17:52 +01:00 zatwierdzone przez Matt Westcott
rodzic e3bbccd814
commit 68febd981d
5 zmienionych plików z 39 dodań i 10 usunięć

Wyświetl plik

@ -1,5 +1,4 @@
import datetime
from html import unescape
from django import forms
from django.db.models.fields import BLANK_CHOICE_DASH
@ -8,10 +7,10 @@ from django.template.loader import render_to_string
from django.utils.dateparse import parse_date, parse_datetime, parse_time
from django.utils.encoding import force_str
from django.utils.functional import cached_property
from django.utils.html import format_html, strip_tags
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from wagtail.core.rich_text import RichText
from wagtail.core.rich_text import RichText, get_text_for_indexing
from wagtail.core.utils import resolve_model_string
from .base import Block
@ -573,9 +572,9 @@ class RichTextBlock(FieldBlock):
return RichText(value)
def get_searchable_content(self, value):
    """
    Return the plain-text content of a rich text value for search indexing.

    ``value`` is expected to expose the rich text markup as ``value.source``.
    The result is a single-element list holding that markup converted to
    plain text by ``get_text_for_indexing``.
    """
    # Strip HTML tags to prevent search backend from indexing them
    source = force_str(value.source)
    # get_text_for_indexing (rather than a bare strip_tags) is used so that
    # adjacent block-level elements do not get merged into a single word
    return [get_text_for_indexing(source)]
class Meta:
icon = "doc-full"

Wyświetl plik

@ -1,12 +1,11 @@
import json
from html import unescape
from django.core.serializers.json import DjangoJSONEncoder
from django.db import models
from django.utils.encoding import force_str
from django.utils.html import strip_tags
from wagtail.core.blocks import Block, BlockField, StreamBlock, StreamValue
from wagtail.core.rich_text import get_text_for_indexing
class RichTextField(models.TextField):
@ -23,9 +22,9 @@ class RichTextField(models.TextField):
return super().formfield(**defaults)
def get_searchable_content(self, value):
    """
    Return the plain-text content of this field's value for search indexing.

    ``value`` is coerced to a string and treated as rich text markup. The
    result is a single-element list holding that markup converted to plain
    text by ``get_text_for_indexing``.
    """
    # Strip HTML tags to prevent search backend from indexing them
    source = force_str(value)
    # get_text_for_indexing (rather than a bare strip_tags) is used so that
    # adjacent block-level elements do not get merged into a single word
    return [get_text_for_indexing(source)]
# https://github.com/django/django/blob/64200c14e0072ba0ffef86da46b2ea82fd1e019a/django/db/models/fields/subclassing.py#L31-L44

Wyświetl plik

@ -1,5 +1,9 @@
import re
from html import unescape
from django.db.models import Model
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.utils.safestring import mark_safe
from wagtail.core.rich_text.feature_registry import FeatureRegistry
@ -33,6 +37,17 @@ def expand_db_html(html):
return FRONTEND_REWRITER(html)
def get_text_for_indexing(richtext):
    """
    Return a plain text version of a rich text string, suitable for search indexing;
    like Django's strip_tags, but ensures that whitespace is left between block elements
    so that <p>hello</p><p>world</p> gives "hello world", not "helloworld".

    Line breaks and horizontal rules (<br> and <hr>, with or without a
    self-closing slash) are handled the same way, so that
    <p>hello<br/>world</p> also gives "hello world".

    ``richtext`` is the rich text markup as a string. The return value is a
    string with tags removed, leading and trailing whitespace trimmed, and
    HTML entities converted to the characters they represent.
    """
    # insert space after </p>, </h1> - </h6>, </li> and </blockquote> tags
    richtext = re.sub(r'(</(p|h\d|li|blockquote)>)', r'\1 ', richtext, flags=re.IGNORECASE)
    # also insert space after <br> and <hr>: they separate words visually but
    # leave nothing behind once the tag itself has been stripped
    richtext = re.sub(r'(<(br|hr)\s*/?>)', r'\1 ', richtext, flags=re.IGNORECASE)
    # tags are removed and the outer whitespace trimmed first; entities are decoded last
    return unescape(strip_tags(richtext).strip())
class RichText:
"""
A custom object used to represent a renderable rich text value.

Wyświetl plik

@ -558,11 +558,17 @@ class TestRichTextBlock(TestCase):
result = block.get_searchable_content(value)
self.assertEqual(
result, [
'Merry Christmas! & a happy new year\n'
'Merry Christmas! & a happy new year \n'
'Our Santa pet Wagtail has some cool stuff in store for you all!'
]
)
def test_get_searchable_content_whitespace(self):
    # Two adjacent paragraphs must be indexed as separate words, while the
    # inline <i> tag inside "potatoes" must not split that word.
    rich_text_block = blocks.RichTextBlock()
    searchable = rich_text_block.get_searchable_content(
        RichText('<p>mashed</p><p>po<i>ta</i>toes</p>')
    )
    self.assertEqual(searchable, ['mashed potatoes'])
class TestChoiceBlock(WagtailTestUtils, SimpleTestCase):
def setUp(self):

Wyświetl plik

@ -173,3 +173,13 @@ class TestRichTextField(TestCase):
value = body_field.value_from_object(christmas_page)
result = body_field.get_searchable_content(value)
self.assertEqual(result, ['Merry Christmas from Wagtail! & co.'])
def test_get_searchable_content_whitespace(self):
    # Two adjacent paragraphs must be indexed as separate words, while the
    # inline <i> tag inside "potatoes" must not split that word.
    page = EventPage.objects.get(url_path='/home/events/christmas/')
    page.body = '<p>mashed</p><p>po<i>ta</i>toes</p>'
    page.save_revision(submitted_for_moderation=False)

    field = page._meta.get_field('body')
    searchable = field.get_searchable_content(field.value_from_object(page))
    self.assertEqual(searchable, ['mashed potatoes'])