From e62d67bd730a1358124b821d291c272f9b955797 Mon Sep 17 00:00:00 2001
From: Bertrand Bordage
Date: Wed, 12 Apr 2017 17:16:16 +0200
Subject: [PATCH] Adds PostgreSQL search backend. (#3515)

* Adds PostgreSQL search backend.
* Isort nitpicks.
* Fixes PostgreSQL versions incompatibilities.
* Uses Django lru_cache instead of building our own.
* Fixes PostgreSQL search index on some empty vector & query cases.
* Never sets the PostgreSQL search vector to NULL.
* Simplification + removes caching on two fast enough functions.
* Rewrites stale entries deletion to use the ORM.
---
 docs/reference/contrib/index.rst               |   1 +
 docs/reference/contrib/postgres_search.rst     | 131 +++++++
 docs/topics/search/backends.rst                |  10 +
 docs/topics/search/searching.rst               |   2 +
 wagtail/contrib/postgres_search/__init__.py    |   1 +
 wagtail/contrib/postgres_search/apps.py        |  29 ++
 wagtail/contrib/postgres_search/backend.py     | 367 ++++++++++++++++++
 .../migrations/0001_initial.py                 |  48 +++
 .../postgres_search/migrations/__init__.py     |   0
 wagtail/contrib/postgres_search/models.py      |  71 ++++
 .../contrib/postgres_search/tests/__init__.py  |   0
 .../postgres_search/tests/test_backend.py      |  42 ++
 wagtail/contrib/postgres_search/utils.py       | 118 ++++++
 wagtail/tests/settings.py                      |   9 +
 14 files changed, 829 insertions(+)
 create mode 100644 docs/reference/contrib/postgres_search.rst
 create mode 100644 wagtail/contrib/postgres_search/__init__.py
 create mode 100644 wagtail/contrib/postgres_search/apps.py
 create mode 100644 wagtail/contrib/postgres_search/backend.py
 create mode 100644 wagtail/contrib/postgres_search/migrations/0001_initial.py
 create mode 100644 wagtail/contrib/postgres_search/migrations/__init__.py
 create mode 100644 wagtail/contrib/postgres_search/models.py
 create mode 100644 wagtail/contrib/postgres_search/tests/__init__.py
 create mode 100644 wagtail/contrib/postgres_search/tests/test_backend.py
 create mode 100644 wagtail/contrib/postgres_search/utils.py

diff --git a/docs/reference/contrib/index.rst b/docs/reference/contrib/index.rst
index bac4346839..98703861a2 100644
--- a/docs/reference/contrib/index.rst
+++ b/docs/reference/contrib/index.rst
@@ -14,6 +14,7 @@ Wagtail ships with a variety of extra optional modules.
     routablepage
     api/index
     modeladmin/index
+    postgres_search
     searchpromotions
     table_block

diff --git a/docs/reference/contrib/postgres_search.rst b/docs/reference/contrib/postgres_search.rst
new file mode 100644
index 0000000000..47fe10e8ec
--- /dev/null
+++ b/docs/reference/contrib/postgres_search.rst
@@ -0,0 +1,131 @@
+.. _postgres_search:
+
+========================
+PostgreSQL search engine
+========================
+
+This contrib module provides a search engine backend for Wagtail using
+`PostgreSQL full-text search capabilities `_.
+
+.. warning::
+
+   | You need to use Django 1.10 or later to be able to use this backend.
+   | You can only use this module to index data from a PostgreSQL database.
+
+**Features**:
+
+- Supports all the search features available in Wagtail.
+- Easy to install and adds no external dependency or service.
+- Excellent performance for sites with up to 200,000 pages,
+  and still decent performance for sites of up to a million pages.
+- Faster to reindex than Elasticsearch if you use PostgreSQL 9.5 or later.
+
+**Downsides**:
+
+- ``SearchField(partial_match=True)`` is not handled.
+- Due to a PostgreSQL limitation, ``SearchField(boost=…)`` is only partially
+  respected: boost values are mapped onto at most 4 different weights.
+  If you define 4 or fewer different boosts, ranking follows them exactly.
+  If you define more than 4, nearby boost values are grouped together, so
+  ranking will be slightly less precise, but still close to what you specified.
+- When :ref:`wagtailsearch_specifying_fields`, the index is not used,
+  so searching will be slow on large sites.
+- Also when :ref:`wagtailsearch_specifying_fields`, you cannot search
+  on a specific method.
+
+
+Installation
+============
+
+Add ``'wagtail.contrib.postgres_search',`` anywhere in your ``INSTALLED_APPS``:
+
+.. code-block:: python
+
+    INSTALLED_APPS = [
+        ...
+        'wagtail.contrib.postgres_search',
+        ...
+    ]
+
+Then configure Wagtail to use it as a search backend.
+Give it the alias ``'default'`` if you want it to be the default search backend:
+
+.. code-block:: python
+
+    WAGTAILSEARCH_BACKENDS = {
+        'default': {
+            'BACKEND': 'wagtail.contrib.postgres_search.backend',
+        },
+    }
+
+You then need to index existing data in this backend using
+the :ref:`update_index` command. You can rerun this command whenever
+you want, but it should not be needed after the first run, since
+the index is automatically kept up to date when data is modified.
+To disable this behaviour, see :ref:`wagtailsearch_backends_auto_update`.
+
+
+Configuration
+=============
+
+Language / PostgreSQL search configuration
+------------------------------------------
+
+Use the additional ``'SEARCH_CONFIG'`` key to define which PostgreSQL
+search configuration should be used. For example:
+
+.. code-block:: python
+
+    WAGTAILSEARCH_BACKENDS = {
+        'default': {
+            'BACKEND': 'wagtail.contrib.postgres_search.backend',
+            'SEARCH_CONFIG': 'english',
+        }
+    }
+
+A PostgreSQL search configuration mostly defines rules for a language,
+English in this case. It consists of a set of algorithms
+(parsers & analysers) and language specifications (stop words, stems,
+dictionaries, synonyms, thesauruses, etc.).
+
+A few search configurations are already defined by default in PostgreSQL.
+You can list them using ``sudo -u postgres psql -c "\dF"`` in a Unix shell
+or by using this SQL query: ``SELECT cfgname FROM pg_catalog.pg_ts_config``.
+
+These predefined search configurations are decent, but basic
+compared to commercial search engines.
+If you want better support for your language, you will have to create
+your own PostgreSQL search configuration. See the PostgreSQL documentation for
+`an example `_,
+`the list of parsers `_
+and `a guide to use dictionaries `_.
+
+Atomic rebuild
+--------------
+
+Like the Elasticsearch backend, this backend supports
+:ref:`wagtailsearch_backends_atomic_rebuild`:
+
+.. code-block:: python
+
+    WAGTAILSEARCH_BACKENDS = {
+        'default': {
+            'BACKEND': 'wagtail.contrib.postgres_search.backend',
+            'ATOMIC_REBUILD': True,
+        }
+    }
+
+Atomic rebuild is rarely needed with this backend. In Elasticsearch, all data
+is removed before rebuilding the index, but in this PostgreSQL backend,
+only objects that no longer exist in the database are removed. The index is
+then updated progressively, so there is never a moment when it is empty.
+
+However, if you want to be extra sure that nothing goes wrong while updating
+the index, you can use atomic rebuild. The index is then rebuilt inside a
+transaction and nobody has access to it until reindexing is complete. If any
+error occurs during the operation, all changes to the index are reverted
+as if reindexing had never happened.
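For illustration, here is a sketch of how a site might rely on the boost
behaviour described above. The ``BlogPage`` model and its fields are
hypothetical and not part of this patch; the point is that a couple of
explicit boosts, plus the defaults inherited from ``Page.search_fields``,
stay within the four available PostgreSQL weights, so ranking follows them
closely:

.. code-block:: python

    from django.db import models

    from wagtail.wagtailcore.models import Page
    from wagtail.wagtailsearch import index


    class BlogPage(Page):
        intro = models.TextField(blank=True)
        body = models.TextField(blank=True)

        # Distinct boost values are bucketed into PostgreSQL's four
        # weights (A, B, C, D); with few distinct values the mapping
        # is exact.
        search_fields = Page.search_fields + [
            index.SearchField('intro', boost=2),
            index.SearchField('body', boost=1),
        ]

A query such as ``BlogPage.objects.live().search("hello world")`` would then
rank matches in ``intro`` above equivalent matches in ``body``.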
diff --git a/docs/topics/search/backends.rst b/docs/topics/search/backends.rst index 712d50a591..4df1d33101 100644 --- a/docs/topics/search/backends.rst +++ b/docs/topics/search/backends.rst @@ -70,6 +70,16 @@ It also doesn't support: If any of these features are important to you, we recommend using Elasticsearch instead. +PostgreSQL Backend +------------------ + +``wagtail.contrib.postgres_search.backend`` + +If you use PostgreSQL for your database and your site has less than +a million pages, you probably want to use this backend. + +See :ref:`postgres_search` for more detail. + .. _wagtailsearch_backends_elasticsearch: diff --git a/docs/topics/search/searching.rst b/docs/topics/search/searching.rst index 61e11359a9..a9f510230f 100644 --- a/docs/topics/search/searching.rst +++ b/docs/topics/search/searching.rst @@ -82,6 +82,8 @@ You can also pass a QuerySet into the ``search`` method which allows you to add [] +.. _wagtailsearch_specifying_fields: + Specifying the fields to search ------------------------------- diff --git a/wagtail/contrib/postgres_search/__init__.py b/wagtail/contrib/postgres_search/__init__.py new file mode 100644 index 0000000000..a4dcfe6e66 --- /dev/null +++ b/wagtail/contrib/postgres_search/__init__.py @@ -0,0 +1 @@ +default_app_config = 'wagtail.contrib.postgres_search.apps.PostgresSearchConfig' diff --git a/wagtail/contrib/postgres_search/apps.py b/wagtail/contrib/postgres_search/apps.py new file mode 100644 index 0000000000..8d0b93fbce --- /dev/null +++ b/wagtail/contrib/postgres_search/apps.py @@ -0,0 +1,29 @@ +from __future__ import absolute_import, unicode_literals + +from django.apps import AppConfig +from django.core.checks import Error, Tags, register + +from .utils import ( + BOOSTS_WEIGHTS, WEIGHTS_COUNT, WEIGHTS_VALUES, determine_boosts_weights, + get_postgresql_connections) + + +class PostgresSearchConfig(AppConfig): + name = 'wagtail.contrib.postgres_search' + + def ready(self): + @register(Tags.compatibility, Tags.database) + def check_if_postgresql(app_configs, **kwargs): + if get_postgresql_connections(): + return [] + return [Error('You must use a PostgreSQL database ' + 'to use PostgreSQL search.', + id='wagtail.contrib.postgres_search.E001')] + + BOOSTS_WEIGHTS.extend(determine_boosts_weights()) + sorted_boosts_weights = sorted(BOOSTS_WEIGHTS, key=lambda t: t[0]) + max_weight = sorted_boosts_weights[-1][0] + WEIGHTS_VALUES.extend([v / max_weight + for v, w in sorted_boosts_weights]) + for _ in range(WEIGHTS_COUNT - len(WEIGHTS_VALUES)): + WEIGHTS_VALUES.insert(0, 0) diff --git a/wagtail/contrib/postgres_search/backend.py b/wagtail/contrib/postgres_search/backend.py new file mode 100644 index 0000000000..dcec8f7afb --- /dev/null +++ b/wagtail/contrib/postgres_search/backend.py @@ -0,0 +1,367 @@ +# coding: utf-8 + +from __future__ import absolute_import, unicode_literals + +from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector +from django.db import DEFAULT_DB_ALIAS, NotSupportedError, connections, transaction +from django.db.models import F, Manager, TextField, Value +from django.db.models.constants import LOOKUP_SEP +from django.db.models.functions import Cast +from django.utils.encoding import force_text, python_2_unicode_compatible +from django.utils.six import string_types + +from wagtail.wagtailsearch.backends.base import ( + BaseSearchBackend, BaseSearchQuery, BaseSearchResults) +from wagtail.wagtailsearch.index import RelatedFields, SearchField + +from .models import IndexEntry +from .utils import ( + ADD, AND, 
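+    # ADD, AND and OR reduce an iterable of expressions with +, & and |
+    # respectively; the remaining helpers handle content type lookups,
+    # search weights and keyword parsing (defined in utils.py below).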
OR, WEIGHTS_VALUES, get_content_types_pks, get_postgresql_connections, get_weight, + keyword_split, unidecode) + + +# TODO: Add autocomplete. + + +def get_db_alias(queryset): + return queryset._db or DEFAULT_DB_ALIAS + + +def get_sql(queryset): + return queryset.query.get_compiler(get_db_alias(queryset)).as_sql() + + +def get_pk_column(model): + return model._meta.pk.get_attname_column()[1] + + +@python_2_unicode_compatible +class Index(object): + def __init__(self, backend, model, db_alias=None): + self.backend = backend + self.model = model + if db_alias is None: + db_alias = DEFAULT_DB_ALIAS + if connections[db_alias].vendor != 'postgresql': + raise NotSupportedError( + 'You must select a PostgreSQL database ' + 'to use PostgreSQL search.') + self.db_alias = db_alias + self.name = model._meta.label + self.search_fields = self.model.get_search_fields() + + def add_model(self, model): + pass + + def refresh(self): + pass + + def delete_stale_entries(self): + if self.model._meta.parents: + # We don’t need to delete stale entries for non-root models, + # since we already delete them by deleting roots. + return + existing_pks = (self.model._default_manager.using(self.db_alias) + .annotate(object_id=Cast('pk', TextField())) + .values('object_id')) + stale_entries = (IndexEntry._default_manager.using(self.db_alias) + .for_models(self.model) + .exclude(object_id__in=existing_pks)) + stale_entries.delete() + + def get_config(self): + return self.backend.params.get('SEARCH_CONFIG') + + def prepare_value(self, value): + if isinstance(value, string_types): + return value + if isinstance(value, list): + return ', '.join(self.prepare_value(item) for item in value) + if isinstance(value, dict): + return ', '.join(self.prepare_value(item) + for item in value.values()) + return force_text(value) + + def prepare_field(self, obj, field): + if isinstance(field, SearchField): + yield (unidecode(self.prepare_value(field.get_value(obj))), + get_weight(field.boost)) + elif isinstance(field, RelatedFields): + sub_obj = getattr(obj, field.field_name) + if sub_obj is None: + return + if callable(sub_obj): + sub_obj = sub_obj() + if isinstance(sub_obj, Manager): + sub_objs = sub_obj.all() + else: + sub_objs = [sub_obj] + for sub_obj in sub_objs: + for sub_field in field.fields: + for value in self.prepare_field(sub_obj, sub_field): + yield value + + def prepare_body(self, obj): + return [(value, boost) for field in self.search_fields + for value, boost in self.prepare_field(obj, field)] + + def add_item(self, obj): + self.add_items(self.model, [obj]) + + def add_items_upsert(self, connection, content_type_pk, objs, config): + vectors_sql = [] + data_params = [] + sql_template = ('to_tsvector(%s)' if config is None + else "to_tsvector('%s', %%s)" % config) + sql_template = 'setweight(%s, %%s)' % sql_template + for obj in objs: + data_params.extend((content_type_pk, obj._object_id)) + if obj._body_: + vectors_sql.append('||'.join(sql_template for _ in obj._body_)) + data_params.extend([v for t in obj._body_ for v in t]) + else: + vectors_sql.append("''::tsvector") + data_sql = ', '.join(['(%%s, %%s, %s)' % s for s in vectors_sql]) + with connection.cursor() as cursor: + cursor.execute(""" + INSERT INTO %s(content_type_id, object_id, body_search) + (VALUES %s) + ON CONFLICT (content_type_id, object_id) + DO UPDATE SET body_search = EXCLUDED.body_search + """ % (IndexEntry._meta.db_table, data_sql), data_params) + + def add_items_update_then_create(self, content_type_pk, objs, config): + ids_and_objs = {} + for obj in 
objs: + obj._search_vector = ( + ADD([ + SearchVector(Value(text), weight=weight, config=config) + for text, weight in obj._body_]) + if obj._body_ else SearchVector(Value(''))) + ids_and_objs[obj._object_id] = obj + index_entries = IndexEntry._default_manager.using(self.db_alias) + index_entries_for_ct = index_entries.filter( + content_type_id=content_type_pk) + indexed_ids = frozenset( + index_entries_for_ct.filter(object_id__in=ids_and_objs) + .values_list('object_id', flat=True)) + for indexed_id in indexed_ids: + obj = ids_and_objs[indexed_id] + index_entries_for_ct.filter(object_id=obj._object_id) \ + .update(body_search=obj._search_vector) + to_be_created = [] + for object_id in ids_and_objs: + if object_id not in indexed_ids: + to_be_created.append(IndexEntry( + content_type_id=content_type_pk, + object_id=object_id, + body_search=ids_and_objs[object_id]._search_vector, + )) + index_entries.bulk_create(to_be_created) + + def add_items(self, model, objs): + content_type_pk = get_content_types_pks((model,), self.db_alias)[0] + config = self.get_config() + for obj in objs: + obj._object_id = force_text(obj.pk) + obj._body_ = self.prepare_body(obj) + connection = connections[self.db_alias] + if connection.pg_version >= 90500: # PostgreSQL >= 9.5 + self.add_items_upsert(connection, content_type_pk, objs, config) + else: + self.add_items_update_then_create(content_type_pk, objs, config) + + def __str__(self): + return self.name + + +class PostgresSearchQuery(BaseSearchQuery): + DEFAULT_OPERATOR = 'and' + + def __init__(self, *args, **kwargs): + super(PostgresSearchQuery, self).__init__(*args, **kwargs) + self.search_fields = self.queryset.model.get_search_fields() + + def get_search_query(self, config): + combine = OR if self.operator == 'or' else AND + search_terms = keyword_split(unidecode(self.query_string)) + if not search_terms: + return SearchQuery('') + return combine(SearchQuery(q, config=config) for q in search_terms) + + def get_base_queryset(self): + # Removes order for performance’s sake. + return self.queryset.order_by() + + def get_in_index_queryset(self, queryset, search_query): + return (IndexEntry._default_manager.using(get_db_alias(queryset)) + .for_models(queryset.model).filter(body_search=search_query)) + + def get_in_index_count(self, queryset, search_query): + index_sql, index_params = get_sql( + self.get_in_index_queryset(queryset, search_query).pks()) + model_sql, model_params = get_sql(queryset) + sql = """ + SELECT COUNT(*) + FROM (%s) AS index_entry + INNER JOIN (%s) AS obj ON obj."%s" = index_entry.typed_pk; + """ % (index_sql, model_sql, get_pk_column(queryset.model)) + with connections[get_db_alias(queryset)].cursor() as cursor: + cursor.execute(sql, index_params + model_params) + return cursor.fetchone()[0] + + def get_boost(self, field_name, fields=None): + if fields is None: + fields = self.search_fields + if LOOKUP_SEP in field_name: + field_name, sub_field_name = field_name.split(LOOKUP_SEP, 1) + else: + sub_field_name = None + for field in fields: + if field.field_name == field_name: + # Note: Searching on a specific related field using + # `.search(fields=…)` is not yet supported by Wagtail. + # This method anticipates by already implementing it. 
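+                # For example, a call like .search('foo', fields=['authors__name'])
+                # (with a hypothetical 'authors' RelatedFields entry) would be
+                # split into ('authors', 'name') above, then recurse below into
+                # the related fields to find the boost of the 'name' sub-field.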
+ if isinstance(field, RelatedFields): + return self.get_boost(sub_field_name, field.fields) + return field.boost + + def get_in_fields_queryset(self, queryset, search_query): + if not self.fields: + return queryset.none() + return ( + queryset.annotate( + _search_=ADD( + SearchVector(field, config=search_query.config, + weight=get_weight(self.get_boost(field))) + for field in self.fields)) + .filter(_search_=search_query)) + + def search_count(self, config): + queryset = self.get_base_queryset() + search_query = self.get_search_query(config=config) + if self.fields is None: + return self.get_in_index_count(queryset, search_query) + return self.get_in_fields_queryset(queryset, search_query).count() + + def search_in_index(self, queryset, search_query, start, stop): + index_entries = self.get_in_index_queryset(queryset, search_query) + if self.order_by_relevance: + index_entries = index_entries.rank(search_query) + index_sql, index_params = get_sql(index_entries.pks()) + model_sql, model_params = get_sql(queryset) + model = queryset.model + sql = """ + SELECT obj.* + FROM (%s) AS index_entry + INNER JOIN (%s) AS obj ON obj."%s" = index_entry.typed_pk + OFFSET %%s LIMIT %%s; + """ % (index_sql, model_sql, get_pk_column(model)) + limits = (start, None if stop is None else stop - start) + return model._default_manager.using(get_db_alias(queryset)).raw( + sql, index_params + model_params + limits) + + def search_in_fields(self, queryset, search_query, start, stop): + return (self.get_in_fields_queryset(queryset, search_query) + .annotate(_rank_=SearchRank(F('_search_'), search_query, + weights=WEIGHTS_VALUES)) + .order_by('-_rank_'))[start:stop] + + def search(self, config, start, stop): + queryset = self.get_base_queryset() + if self.query_string is None: + return queryset[start:stop] + search_query = self.get_search_query(config=config) + if self.fields is None: + return self.search_in_index(queryset, search_query, start, stop) + return self.search_in_fields(queryset, search_query, start, stop) + + +class PostgresSearchResult(BaseSearchResults): + def get_config(self): + queryset = self.query.queryset + return self.backend.get_index_for_model( + queryset.model, queryset._db).get_config() + + def _do_search(self): + return list(self.query.search(self.get_config(), + self.start, self.stop)) + + def _do_count(self): + return self.query.search_count(self.get_config()) + + +class PostgresSearchRebuilder: + def __init__(self, index): + self.index = index + + def start(self): + self.index.delete_stale_entries() + return self.index + + def finish(self): + pass + + +class PostgresSearchAtomicRebuilder(PostgresSearchRebuilder): + def __init__(self, index): + super(PostgresSearchAtomicRebuilder, self).__init__(index) + self.transaction = transaction.atomic(using=index.db_alias) + self.transaction_opened = False + + def start(self): + self.transaction.__enter__() + self.transaction_opened = True + return super(PostgresSearchAtomicRebuilder, self).start() + + def finish(self): + self.transaction.__exit__(None, None, None) + self.transaction_opened = False + + def __del__(self): + # TODO: Implement a cleaner way to close the connection on failure. 
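+        # If the rebuilder is garbage-collected while its transaction is
+        # still open, finish() was never reached, most likely because an
+        # error interrupted the rebuild; flag the transaction for rollback
+        # so that a half-built index is not committed.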
+ if self.transaction_opened: + self.transaction.needs_rollback = True + self.finish() + + +class PostgresSearchBackend(BaseSearchBackend): + query_class = PostgresSearchQuery + results_class = PostgresSearchResult + rebuilder_class = PostgresSearchRebuilder + atomic_rebuilder_class = PostgresSearchAtomicRebuilder + + def __init__(self, params): + super(PostgresSearchBackend, self).__init__(params) + self.params = params + if params.get('ATOMIC_REBUILD', False): + self.rebuilder_class = self.atomic_rebuilder_class + + def get_index_for_model(self, model, db_alias=None): + return Index(self, model, db_alias) + + def get_index_for_object(self, obj): + return self.get_index_for_model(obj._meta.model, obj._state.db) + + def reset_index(self): + for connection in get_postgresql_connections(): + IndexEntry._default_manager.using(connection.alias).delete() + + def add_type(self, model): + pass # Not needed. + + def refresh_index(self): + pass # Not needed. + + def add(self, obj): + self.get_index_for_object(obj).add_item(obj) + + def add_bulk(self, model, obj_list): + if obj_list: + self.get_index_for_object(obj_list[0]).add_items(model, obj_list) + + def delete(self, obj): + IndexEntry._default_manager.for_object(obj).delete() + + +SearchBackend = PostgresSearchBackend diff --git a/wagtail/contrib/postgres_search/migrations/0001_initial.py b/wagtail/contrib/postgres_search/migrations/0001_initial.py new file mode 100644 index 0000000000..685526071d --- /dev/null +++ b/wagtail/contrib/postgres_search/migrations/0001_initial.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.10.1 on 2017-03-22 14:53 +from __future__ import unicode_literals + +import django.db.models.deletion + +from django.db import migrations, models + +import django.contrib.postgres.fields.jsonb +import django.contrib.postgres.search +from ..models import IndexEntry + + +table = IndexEntry._meta.db_table + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('contenttypes', '0002_remove_content_type_name'), + ] + + operations = [ + migrations.CreateModel( + name='IndexEntry', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('object_id', models.TextField()), + ('body_search', django.contrib.postgres.search.SearchVectorField()), + ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='contenttypes.ContentType')), + ], + options={ + 'verbose_name_plural': 'index entries', + 'verbose_name': 'index entry', + }, + ), + migrations.AlterUniqueTogether( + name='indexentry', + unique_together=set([('content_type', 'object_id')]), + ), + migrations.RunSQL( + 'CREATE INDEX {0}_body_search ON {0} ' + 'USING GIN(body_search);'.format(table), + 'DROP INDEX {}_body_search;'.format(table), + ), + ] diff --git a/wagtail/contrib/postgres_search/migrations/__init__.py b/wagtail/contrib/postgres_search/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/wagtail/contrib/postgres_search/models.py b/wagtail/contrib/postgres_search/models.py new file mode 100644 index 0000000000..8826d91288 --- /dev/null +++ b/wagtail/contrib/postgres_search/models.py @@ -0,0 +1,71 @@ +from __future__ import absolute_import, unicode_literals + +from django.contrib.contenttypes.fields import GenericForeignKey +from django.contrib.contenttypes.models import ContentType +from django.contrib.postgres.search import SearchRank, SearchVectorField +from django.db.models import ( + CASCADE, AutoField, 
BigAutoField, BigIntegerField, F, ForeignKey, IntegerField, Model, QuerySet, + TextField) +from django.db.models.functions import Cast +from django.utils.encoding import force_text, python_2_unicode_compatible +from django.utils.translation import ugettext_lazy as _ + +from .utils import WEIGHTS_VALUES, get_descendants_content_types_pks + + +class IndexQuerySet(QuerySet): + def for_models(self, *models): + if not models: + return self.none() + return self.filter( + content_type_id__in=get_descendants_content_types_pks(models, + self._db)) + + def for_object(self, obj): + db_alias = obj._state.db + return (self.using(db_alias).for_models(obj._meta.model) + .filter(object_id=force_text(obj.pk))) + + def add_rank(self, search_query): + return self.annotate( + rank=SearchRank( + F('body_search'), search_query, + weights='{' + ','.join(map(str, WEIGHTS_VALUES)) + '}')) + + def rank(self, search_query): + return self.add_rank(search_query).order_by('-rank') + + def pks(self): + cast_field = self.model._meta.pk + if isinstance(cast_field, BigAutoField): + cast_field = BigIntegerField() + elif isinstance(cast_field, AutoField): + cast_field = IntegerField() + return (self.annotate(typed_pk=Cast('object_id', cast_field)) + .values_list('typed_pk', flat=True)) + + +@python_2_unicode_compatible +class IndexEntry(Model): + content_type = ForeignKey(ContentType, on_delete=CASCADE) + # We do not use an IntegerField since primary keys are not always integers. + object_id = TextField() + content_object = GenericForeignKey() + + # TODO: Add per-object boosting. + body_search = SearchVectorField() + + objects = IndexQuerySet.as_manager() + + class Meta: + unique_together = ('content_type', 'object_id') + verbose_name = _('index entry') + verbose_name_plural = _('index entries') + # TODO: Move here the GIN index from the migration. + + def __str__(self): + return '%s: %s' % (self.content_type.name, self.content_object) + + @property + def model(self): + return self.content_type.model diff --git a/wagtail/contrib/postgres_search/tests/__init__.py b/wagtail/contrib/postgres_search/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/wagtail/contrib/postgres_search/tests/test_backend.py b/wagtail/contrib/postgres_search/tests/test_backend.py new file mode 100644 index 0000000000..9ce885f0c5 --- /dev/null +++ b/wagtail/contrib/postgres_search/tests/test_backend.py @@ -0,0 +1,42 @@ +from __future__ import absolute_import, unicode_literals + +from django.core.management import call_command +from django.test import TestCase +from django.utils.six import StringIO + +from wagtail.tests.search.models import SearchTest +from wagtail.wagtailsearch.tests.test_backends import BackendTests + + +class TestPostgresSearchBackend(BackendTests, TestCase): + backend_path = 'wagtail.contrib.postgres_search.backend' + + def test_update_index_command(self): + self.backend.reset_index() + + results = self.backend.search(None, SearchTest) + # We find results anyway because we searched for nothing. + self.assertSetEqual(set(results), + {self.testa, self.testb, self.testc.searchtest_ptr, + self.testd.searchtest_ptr}) + + # But now, we can't find anything because the index is empty. 
+ results = self.backend.search('hello', SearchTest) + self.assertSetEqual(set(results), set()) + results = self.backend.search('world', SearchTest) + self.assertSetEqual(set(results), set()) + + # Run update_index command + with self.ignore_deprecation_warnings(): + # ignore any DeprecationWarnings thrown by models with old-style + # indexed_fields definitions + call_command('update_index', backend_name=self.backend_name, + interactive=False, stdout=StringIO()) + + # And now we can finally find results. + results = self.backend.search('hello', SearchTest) + self.assertSetEqual(set(results), {self.testa, self.testb, + self.testc.searchtest_ptr}) + results = self.backend.search('world', SearchTest) + self.assertSetEqual(set(results), {self.testa, + self.testd.searchtest_ptr}) diff --git a/wagtail/contrib/postgres_search/utils.py b/wagtail/contrib/postgres_search/utils.py new file mode 100644 index 0000000000..0d8393c0d4 --- /dev/null +++ b/wagtail/contrib/postgres_search/utils.py @@ -0,0 +1,118 @@ +from __future__ import absolute_import, unicode_literals + +import operator +import re +from functools import partial, reduce + +from django.apps import apps +from django.db import connections +from django.db.models import Q +from django.utils.lru_cache import lru_cache + +from wagtail.wagtailsearch.index import Indexed, RelatedFields, SearchField + +try: + # Only use the GPLv2 licensed unidecode if it's installed. + from unidecode import unidecode +except ImportError: + def unidecode(value): + return value + + +def get_postgresql_connections(): + return [connection for connection in connections.all() + if connection.vendor == 'postgresql'] + + +# Reduce any iterable to a single value using a logical OR e.g. (a | b | ...) +OR = partial(reduce, operator.or_) +# Reduce any iterable to a single value using a logical AND e.g. (a & b & ...) +AND = partial(reduce, operator.and_) +# Reduce any iterable to a single value using an addition +ADD = partial(reduce, operator.add) + + +def keyword_split(keywords): + """ + Return all the keywords in a keyword string. + + Keeps keywords surrounded by quotes together, removing the surrounding quotes: + + >>> keyword_split('Hello I\\'m looking for "something special"') + ['Hello', "I'm", 'looking', 'for', 'something special'] + + Nested quoted strings are returned as is: + + >>> keyword_split("He said \\"I'm looking for 'something special'\\" so I've given him the 'special item'") + ['He', 'said', "I'm looking for 'something special'", 'so', "I've", 'given', 'him', 'the', 'special item'] + + """ + matches = re.findall(r'"([^"]+)"|\'([^\']+)\'|(\S+)', keywords) + return [match[0] or match[1] or match[2] for match in matches] + + +def get_descendant_models(model): + """ + Returns all descendants of a model, including the model itself. + """ + descendant_models = {other_model for other_model in apps.get_models() + if issubclass(other_model, model)} + descendant_models.add(model) + return descendant_models + + +def get_descendants_content_types_pks(models, db_alias): + return get_content_types_pks( + tuple(descendant_model for model in models + for descendant_model in get_descendant_models(model)), db_alias) + + +@lru_cache() +def get_content_types_pks(models, db_alias): + # We import it locally because this file is loaded before apps are ready. 
+ from django.contrib.contenttypes.models import ContentType + return list(ContentType._default_manager.using(db_alias) + .filter(OR([Q(app_label=model._meta.app_label, + model=model._meta.model_name) + for model in models])) + .values_list('pk', flat=True)) + + +def get_search_fields(search_fields): + for search_field in search_fields: + if isinstance(search_field, SearchField): + yield search_field + elif isinstance(search_field, RelatedFields): + for sub_field in get_search_fields(search_field.fields): + yield sub_field + + +WEIGHTS = 'ABCD' +WEIGHTS_COUNT = len(WEIGHTS) +# These are filled when apps are ready. +BOOSTS_WEIGHTS = [] +WEIGHTS_VALUES = [] + + +def determine_boosts_weights(): + boosts = set() + for model in apps.get_models(): + if issubclass(model, Indexed): + for search_field in get_search_fields(model.get_search_fields()): + boost = search_field.boost + boosts.add(0 if boost is None else boost) + if len(boosts) <= WEIGHTS_COUNT: + return zip(reversed(sorted(boosts)), WEIGHTS) + min_boost = min(boosts) + max_boost = max(boosts) + boost_step = (max_boost - min_boost) / WEIGHTS_COUNT + return [(min_boost + (i * boost_step), weight) + for i, weight in zip(range(WEIGHTS_COUNT), WEIGHTS)] + + +def get_weight(boost): + if boost is None: + boost = 0 + for max_boost, weight in BOOSTS_WEIGHTS: + if boost >= max_boost: + return weight diff --git a/wagtail/tests/settings.py b/wagtail/tests/settings.py index 008361d7fa..5302e6a65d 100644 --- a/wagtail/tests/settings.py +++ b/wagtail/tests/settings.py @@ -178,6 +178,15 @@ WAGTAILSEARCH_BACKENDS = { AUTH_USER_MODEL = 'customuser.CustomUser' +if django.VERSION >= (1, 10) and os.environ.get('DATABASE_ENGINE') in ( + # Remove next line when Django 1.8 support is dropped. + 'django.db.backends.postgresql_psycopg2', + 'django.db.backends.postgresql'): + INSTALLED_APPS += ('wagtail.contrib.postgres_search',) + WAGTAILSEARCH_BACKENDS['postgresql'] = { + 'BACKEND': 'wagtail.contrib.postgres_search.backend', + } + if 'ELASTICSEARCH_URL' in os.environ: if os.environ.get('ELASTICSEARCH_VERSION') == '5': backend = 'wagtail.wagtailsearch.backends.elasticsearch5'
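To make the weighting machinery above easier to follow, here is a small
self-contained sketch (illustrative boost values only, limited to the case of
at most 4 distinct boosts, which ``determine_boosts_weights`` maps directly)
of how boosts become PostgreSQL weights and the ``SearchRank`` weights array:

.. code-block:: python

    WEIGHTS = 'ABCD'  # PostgreSQL tsvector weights; 'A' ranks highest.


    def boosts_to_weights(boosts):
        # Highest boost gets 'A', the next 'B', and so on, mirroring
        # determine_boosts_weights() when there are at most 4 distinct boosts.
        return list(zip(sorted(boosts, reverse=True), WEIGHTS))


    def weight_for(boost, boosts_weights):
        # First bucket whose threshold the boost reaches, as in get_weight().
        for max_boost, weight in boosts_weights:
            if boost >= max_boost:
                return weight


    boosts_weights = boosts_to_weights({0, 1, 2, 10})
    print(boosts_weights)                  # [(10, 'A'), (2, 'B'), (1, 'C'), (0, 'D')]
    print(weight_for(10, boosts_weights))  # 'A', e.g. a title-like field
    print(weight_for(1, boosts_weights))   # 'C'

    # On startup, apps.py also normalises the boosts into the 4-element
    # weights array handed to SearchRank, in the D-to-A order ts_rank expects:
    max_boost = max(boost for boost, _ in boosts_weights)
    print([boost / max_boost for boost, _ in sorted(boosts_weights)])
    # [0.0, 0.1, 0.2, 1.0]

With more than 4 distinct boosts, ``utils.py`` instead splits the boost range
into 4 buckets, which is why the documentation above warns that ranking then
becomes slightly less precise.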