Use standard tokenizer for Elasticsearch to preserve numeric tokens (#12851)

Since its inception, the Elasticsearch backend has defaulted to the `lowercase` tokenizer (https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-tokenizers.html#_word_oriented_tokenizers), which splits text at every non-letter character and lowercases the resulting tokens. This means that numbers within text (even when surrounded by whitespace) are discarded at indexing time and cannot be searched for. Switch the `ngram_analyzer` and `edgengram_analyzer` to the `standard` tokenizer, which keeps numeric tokens, and add the `lowercase` filter to keep searches case-insensitive now that the tokenizer no longer lowercases.
2025-02-06 19:03:26 +00:00 · 2025-02-06 19:03:26 +00:00 · 225d3f384c
commit 225d3f384c
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@ -8,6 +8,7 @@ Changelog
 * Implement `normalize` on `TypedTableBlock` to assist with setting `default` and `preview_value` (Sage Abdullah)
 * Apply normalization when modifying a `StreamBlock`'s value to assist with programmatic changes to `StreamField` (Matt Westcott)
 * Allow a custom image rendition model to define its unique constraint with `models.UniqueConstraint` instead of `unique_together` (Oliver Parker, Cynthia Kiser, Sage Abdullah)
+ * Default to the `standard` tokenizer on Elasticsearch, to correctly handle numbers as tokens (Matt Westcott)
 * Fix: Take preferred language into account for translatable strings in client-side code (Bernhard Bliem, Sage Abdullah)
 * Docs: Add missing `django.contrib.admin` to list of apps in "add to Django project" guide (Mohamed Rabiaa)

--- a/docs/releases/6.5.md
+++ b/docs/releases/6.5.md
@ -17,6 +17,7 @@ depth: 1
 * Implement `normalize` on `TypedTableBlock` to assist with setting `default` and `preview_value` (Sage Abdullah)
 * Apply normalization when modifying a `StreamBlock`'s value to assist with programmatic changes to `StreamField` (Matt Westcott)
 * Allow a custom image rendition model to define its unique constraint with `models.UniqueConstraint` instead of `unique_together` (Oliver Parker, Cynthia Kiser, Sage Abdullah)
+ * Default to the `standard` tokenizer on Elasticsearch, to correctly handle numbers as tokens (Matt Westcott)

 ### Bug fixes

--- a/wagtail/search/backends/elasticsearch7.py
+++ b/wagtail/search/backends/elasticsearch7.py
@ -1141,13 +1141,13 @@ class Elasticsearch7SearchBackend(BaseSearchBackend):
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
-                        "tokenizer": "lowercase",
-                        "filter": ["asciifolding", "ngram"],
+                        "tokenizer": "standard",
+                        "filter": ["asciifolding", "lowercase", "ngram"],
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
-                        "tokenizer": "lowercase",
-                        "filter": ["asciifolding", "edgengram"],
+                        "tokenizer": "standard",
+                        "filter": ["asciifolding", "lowercase", "edgengram"],
                    },
                },
                "tokenizer": {
--- a/wagtail/search/tests/elasticsearch_common_tests.py
+++ b/wagtail/search/tests/elasticsearch_common_tests.py
@ -114,6 +114,49 @@ class ElasticsearchCommonSearchBackendTests(BackendTests):
            ],
        )

+    def test_search_with_numeric_term(self):
+        book = models.Book.objects.create(
+            title="Harry Potter and the 31337 Goblets of Fire",
+            publication_date=date(2009, 7, 15),
+            number_of_pages=607,
+        )
+
+        index = self.backend.get_index_for_model(models.Book)
+        index.add_item(book)
+        index.refresh()
+
+        results = self.backend.search("31337", models.Book)
+        self.assertUnsortedListEqual(
+            [r.title for r in results],
+            [
+                "Harry Potter and the 31337 Goblets of Fire",
+            ],
+        )
+
+        results = self.backend.autocomplete("313", models.Book)
+        self.assertUnsortedListEqual(
+            [r.title for r in results],
+            [
+                "Harry Potter and the 31337 Goblets of Fire",
+            ],
+        )
+
+        results = self.backend.search("31337 goblets", models.Book)
+        self.assertUnsortedListEqual(
+            [r.title for r in results],
+            [
+                "Harry Potter and the 31337 Goblets of Fire",
+            ],
+        )
+
+        results = self.backend.autocomplete("31337 gob", models.Book)
+        self.assertUnsortedListEqual(
+            [r.title for r in results],
+            [
+                "Harry Potter and the 31337 Goblets of Fire",
+            ],
+        )
+
    def test_and_operator_with_single_field(self):
        # Testing for bug #1859
        results = self.backend.search(