Merge pull request #3940 from BertrandBordage/postgres_search_improvements

Postgres_search simplification.
pull/3965/head
Karl Hobley 2017-10-19 16:27:48 +01:00 zatwierdzone przez GitHub
commit 6514650aa4
7 zmienionych plików z 95 dodań i 157 usunięć

Wyświetl plik

@ -33,7 +33,7 @@ def pytest_configure(config):
pass pass
if config.getoption('postgres'): if config.getoption('postgres'):
os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql_psycopg2' os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql'
# Setup django after processing the pytest arguments so that the env # Setup django after processing the pytest arguments so that the env
# variables are available in the settings # variables are available in the settings

Wyświetl plik

@ -560,7 +560,7 @@ These two files should reside in your project directory (``myproject/myproject/`
DATABASES = { DATABASES = {
'default': { 'default': {
'ENGINE': 'django.db.backends.postgresql_psycopg2', 'ENGINE': 'django.db.backends.postgresql',
'NAME': 'myprojectdb', 'NAME': 'myprojectdb',
'USER': 'postgres', 'USER': 'postgres',
'PASSWORD': '', 'PASSWORD': '',

Wyświetl plik

@ -47,7 +47,7 @@ def runtests():
pass pass
if args.postgres: if args.postgres:
os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql_psycopg2' os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql'
if args.elasticsearch: if args.elasticsearch:
os.environ.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200') os.environ.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200')

Wyświetl plik

@ -16,25 +16,13 @@ from wagtail.wagtailsearch.index import RelatedFields, SearchField
from .models import IndexEntry from .models import IndexEntry
from .utils import ( from .utils import (
ADD, AND, OR, WEIGHTS_VALUES, get_content_types_pks, get_postgresql_connections, get_weight, ADD, AND, OR, WEIGHTS_VALUES, get_content_types_pk, get_descendants_content_types_pks,
keyword_split, unidecode) get_postgresql_connections, get_weight, keyword_split, unidecode)
# TODO: Add autocomplete. # TODO: Add autocomplete.
def get_db_alias(queryset):
return queryset._db or DEFAULT_DB_ALIAS
def get_sql(queryset):
return queryset.query.get_compiler(get_db_alias(queryset)).as_sql()
def get_pk_column(model):
return model._meta.pk.get_attname_column()[1]
@python_2_unicode_compatible @python_2_unicode_compatible
class Index(object): class Index(object):
def __init__(self, backend, model, db_alias=None): def __init__(self, backend, model, db_alias=None):
@ -64,14 +52,13 @@ class Index(object):
existing_pks = (self.model._default_manager.using(self.db_alias) existing_pks = (self.model._default_manager.using(self.db_alias)
.annotate(object_id=Cast('pk', TextField())) .annotate(object_id=Cast('pk', TextField()))
.values('object_id')) .values('object_id'))
stale_entries = (IndexEntry._default_manager.using(self.db_alias) content_type_ids = get_descendants_content_types_pks(self.model)
.for_models(self.model) stale_entries = (
.exclude(object_id__in=existing_pks)) IndexEntry._default_manager.using(self.db_alias)
.filter(content_type_id__in=content_type_ids)
.exclude(object_id__in=existing_pks))
stale_entries.delete() stale_entries.delete()
def get_config(self):
return self.backend.params.get('SEARCH_CONFIG')
def prepare_value(self, value): def prepare_value(self, value):
if isinstance(value, string_types): if isinstance(value, string_types):
return value return value
@ -134,9 +121,8 @@ class Index(object):
ids_and_objs = {} ids_and_objs = {}
for obj in objs: for obj in objs:
obj._search_vector = ( obj._search_vector = (
ADD([ ADD([SearchVector(Value(text), weight=weight, config=config)
SearchVector(Value(text), weight=weight, config=config) for text, weight in obj._body_])
for text, weight in obj._body_])
if obj._body_ else SearchVector(Value(''))) if obj._body_ else SearchVector(Value('')))
ids_and_objs[obj._object_id] = obj ids_and_objs[obj._object_id] = obj
index_entries = IndexEntry._default_manager.using(self.db_alias) index_entries = IndexEntry._default_manager.using(self.db_alias)
@ -160,8 +146,8 @@ class Index(object):
index_entries.bulk_create(to_be_created) index_entries.bulk_create(to_be_created)
def add_items(self, model, objs): def add_items(self, model, objs):
content_type_pk = get_content_types_pks((model,), self.db_alias)[0] content_type_pk = get_content_types_pk(model)
config = self.get_config() config = self.backend.get_config()
for obj in objs: for obj in objs:
obj._object_id = force_text(obj.pk) obj._object_id = force_text(obj.pk)
obj._body_ = self.prepare_body(obj) obj._body_ = self.prepare_body(obj)
@ -189,27 +175,6 @@ class PostgresSearchQuery(BaseSearchQuery):
return SearchQuery('') return SearchQuery('')
return combine(SearchQuery(q, config=config) for q in search_terms) return combine(SearchQuery(q, config=config) for q in search_terms)
def get_base_queryset(self):
# Removes order for performance's sake.
return self.queryset.order_by()
def get_in_index_queryset(self, queryset, search_query):
return (IndexEntry._default_manager.using(get_db_alias(queryset))
.for_models(queryset.model).filter(body_search=search_query))
def get_in_index_count(self, queryset, search_query):
index_sql, index_params = get_sql(
self.get_in_index_queryset(queryset, search_query).pks())
model_sql, model_params = get_sql(queryset)
sql = """
SELECT COUNT(*)
FROM (%s) AS index_entry
INNER JOIN (%s) AS obj ON obj."%s" = index_entry.typed_pk;
""" % (index_sql, model_sql, get_pk_column(queryset.model))
with connections[get_db_alias(queryset)].cursor() as cursor:
cursor.execute(sql, index_params + model_params)
return cursor.fetchone()[0]
def get_boost(self, field_name, fields=None): def get_boost(self, field_name, fields=None):
if fields is None: if fields is None:
fields = self.search_fields fields = self.search_fields
@ -226,78 +191,43 @@ class PostgresSearchQuery(BaseSearchQuery):
return self.get_boost(sub_field_name, field.fields) return self.get_boost(sub_field_name, field.fields)
return field.boost return field.boost
def get_in_fields_queryset(self, queryset, search_query):
if not self.fields:
return queryset.none()
return (
queryset.annotate(
_search_=ADD(
SearchVector(field, config=search_query.config,
weight=get_weight(self.get_boost(field)))
for field in self.fields))
.filter(_search_=search_query))
def search_count(self, config):
queryset = self.get_base_queryset()
search_query = self.get_search_query(config=config)
if self.fields is None:
return self.get_in_index_count(queryset, search_query)
return self.get_in_fields_queryset(queryset, search_query).count()
def search_in_index(self, queryset, search_query, start, stop):
index_entries = self.get_in_index_queryset(queryset, search_query)
values = ['typed_pk']
if self.order_by_relevance:
index_entries = index_entries.rank(search_query)
values.append('rank')
order_sql = 'index_entry.rank DESC, id ASC'
else:
order_sql = 'id ASC'
index_sql, index_params = get_sql(
index_entries.annotate_typed_pk()
.values(*values)
)
model_sql, model_params = get_sql(queryset)
model = queryset.model
sql = """
SELECT obj.*
FROM (%s) AS index_entry
INNER JOIN (%s) AS obj ON obj."%s" = index_entry.typed_pk
ORDER BY %s
OFFSET %%s LIMIT %%s;
""" % (index_sql, model_sql, get_pk_column(model), order_sql)
limits = (start, None if stop is None else stop - start)
return model._default_manager.using(get_db_alias(queryset)).raw(
sql, index_params + model_params + limits)
def search_in_fields(self, queryset, search_query, start, stop):
return (self.get_in_fields_queryset(queryset, search_query)
.annotate(_rank_=SearchRank(F('_search_'), search_query,
weights=WEIGHTS_VALUES))
.order_by('-_rank_'))[start:stop]
def search(self, config, start, stop): def search(self, config, start, stop):
queryset = self.get_base_queryset()
if self.query_string is None: if self.query_string is None:
return queryset[start:stop] return self.queryset[start:stop]
search_query = self.get_search_query(config=config) search_query = self.get_search_query(config=config)
queryset = self.queryset
query = queryset.query
if self.fields is None: if self.fields is None:
return self.search_in_index(queryset, search_query, start, stop) vector = F('index_entries__body_search')
return self.search_in_fields(queryset, search_query, start, stop) else:
vector = ADD(
SearchVector(field, config=search_query.config,
weight=get_weight(self.get_boost(field)))
for field in self.fields)
vector = vector.resolve_expression(query)
search_query = search_query.resolve_expression(query)
lookup = IndexEntry._meta.get_field('body_search').get_lookup('exact')(
vector, search_query)
query.where.add(lookup, 'AND')
if self.order_by_relevance:
# Due to a Django bug, arrays are not automatically converted here.
converted_weights = '{' + ','.join(map(str, WEIGHTS_VALUES)) + '}'
queryset = queryset.order_by(SearchRank(vector, search_query,
weights=converted_weights).desc(),
'-pk')
elif not queryset.query.order_by:
# Adds a default ordering to avoid issue #3729.
queryset = queryset.order_by('-pk')
return queryset[start:stop]
class PostgresSearchResults(BaseSearchResults): class PostgresSearchResults(BaseSearchResults):
def get_config(self):
queryset = self.query.queryset
return self.backend.get_index_for_model(
queryset.model, queryset._db).get_config()
def _do_search(self): def _do_search(self):
return list(self.query.search(self.get_config(), return list(self.query.search(self.backend.get_config(),
self.start, self.stop)) self.start, self.stop))
def _do_count(self): def _do_count(self):
return self.query.search_count(self.get_config()) return self.query.search(self.backend.get_config(), None, None).count()
class PostgresSearchRebuilder: class PostgresSearchRebuilder:
@ -345,6 +275,10 @@ class PostgresSearchBackend(BaseSearchBackend):
self.params = params self.params = params
if params.get('ATOMIC_REBUILD', False): if params.get('ATOMIC_REBUILD', False):
self.rebuilder_class = self.atomic_rebuilder_class self.rebuilder_class = self.atomic_rebuilder_class
IndexEntry.add_generic_relations()
def get_config(self):
return self.params.get('SEARCH_CONFIG')
def get_index_for_model(self, model, db_alias=None): def get_index_for_model(self, model, db_alias=None):
return Index(self, model, db_alias) return Index(self, model, db_alias)
@ -370,7 +304,7 @@ class PostgresSearchBackend(BaseSearchBackend):
self.get_index_for_object(obj_list[0]).add_items(model, obj_list) self.get_index_for_object(obj_list[0]).add_items(model, obj_list)
def delete(self, obj): def delete(self, obj):
IndexEntry._default_manager.for_object(obj).delete() obj.index_entries.all().delete()
SearchBackend = PostgresSearchBackend SearchBackend = PostgresSearchBackend

Wyświetl plik

@ -44,5 +44,11 @@ class Migration(migrations.Migration):
'CREATE INDEX {0}_body_search ON {0} ' 'CREATE INDEX {0}_body_search ON {0} '
'USING GIN(body_search);'.format(table), 'USING GIN(body_search);'.format(table),
'DROP INDEX {}_body_search;'.format(table), 'DROP INDEX {}_body_search;'.format(table),
state_operations=[migrations.AddIndex(
model_name='indexentry',
index=django.contrib.postgres.indexes.GinIndex(
fields=['body_search'],
name='postgres_se_body_se_70ba1a_gin'),
)],
), ),
] ]

Wyświetl plik

@ -1,50 +1,43 @@
from __future__ import absolute_import, unicode_literals from __future__ import absolute_import, unicode_literals
from django.contrib.contenttypes.fields import GenericForeignKey from django.apps import apps
from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation
from django.contrib.contenttypes.models import ContentType from django.contrib.contenttypes.models import ContentType
from django.contrib.postgres.search import SearchRank, SearchVectorField from django.contrib.postgres.indexes import GinIndex
from django.db.models import ( from django.contrib.postgres.search import SearchVectorField
CASCADE, AutoField, BigAutoField, BigIntegerField, F, ForeignKey, IntegerField, Model, QuerySet, from django.db.models import CASCADE, ForeignKey, Model, TextField
TextField)
from django.db.models.functions import Cast from django.db.models.functions import Cast
from django.utils.encoding import force_text, python_2_unicode_compatible from django.utils.encoding import python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from .utils import WEIGHTS_VALUES, get_descendants_content_types_pks from ...wagtailsearch.index import class_is_indexed
from .utils import get_descendants_content_types_pks
class IndexQuerySet(QuerySet): class TextIDGenericRelation(GenericRelation):
def for_models(self, *models): def get_content_type_lookup(self, alias, remote_alias):
if not models: field = self.remote_field.model._meta.get_field(
return self.none() self.content_type_field_name)
return self.filter( return field.get_lookup('in')(
content_type_id__in=get_descendants_content_types_pks(models, field.get_col(remote_alias),
self._db)) get_descendants_content_types_pks(self.model))
def for_object(self, obj): def get_object_id_lookup(self, alias, remote_alias):
db_alias = obj._state.db from_field = self.remote_field.model._meta.get_field(
return (self.using(db_alias).for_models(obj._meta.model) self.object_id_field_name)
.filter(object_id=force_text(obj.pk))) to_field = self.model._meta.pk
return from_field.get_lookup('exact')(
from_field.get_col(remote_alias),
Cast(to_field.get_col(alias), from_field))
def add_rank(self, search_query): def get_extra_restriction(self, where_class, alias, remote_alias):
return self.annotate( cond = where_class()
rank=SearchRank( cond.add(self.get_content_type_lookup(alias, remote_alias), 'AND')
F('body_search'), search_query, cond.add(self.get_object_id_lookup(alias, remote_alias), 'AND')
weights='{' + ','.join(map(str, WEIGHTS_VALUES)) + '}')) return cond
def rank(self, search_query): def resolve_related_fields(self):
return self.add_rank(search_query).order_by('-rank') return []
def annotate_typed_pk(self):
cast_field = self.model._meta.pk
if isinstance(cast_field, BigAutoField):
cast_field = BigIntegerField()
elif isinstance(cast_field, AutoField):
cast_field = IntegerField()
return self.annotate(typed_pk=Cast('object_id', cast_field))
def pks(self):
return self.annotate_typed_pk().values_list('typed_pk', flat=True)
@python_2_unicode_compatible @python_2_unicode_compatible
@ -57,13 +50,11 @@ class IndexEntry(Model):
# TODO: Add per-object boosting. # TODO: Add per-object boosting.
body_search = SearchVectorField() body_search = SearchVectorField()
objects = IndexQuerySet.as_manager()
class Meta: class Meta:
unique_together = ('content_type', 'object_id') unique_together = ('content_type', 'object_id')
verbose_name = _('index entry') verbose_name = _('index entry')
verbose_name_plural = _('index entries') verbose_name_plural = _('index entries')
# TODO: Move here the GIN index from the migration. indexes = [GinIndex(['body_search'])]
def __str__(self): def __str__(self):
return '%s: %s' % (self.content_type.name, self.content_object) return '%s: %s' % (self.content_type.name, self.content_object)
@ -71,3 +62,10 @@ class IndexEntry(Model):
@property @property
def model(self): def model(self):
return self.content_type.model return self.content_type.model
@classmethod
def add_generic_relations(cls):
for model in apps.get_models():
if class_is_indexed(model):
TextIDGenericRelation(cls).contribute_to_class(model,
'index_entries')

Wyświetl plik

@ -60,17 +60,17 @@ def get_descendant_models(model):
return descendant_models return descendant_models
def get_descendants_content_types_pks(models, db_alias): def get_descendants_content_types_pks(model):
return get_content_types_pks( from django.contrib.contenttypes.models import ContentType
tuple(descendant_model for model in models return [ct.pk for ct in
for descendant_model in get_descendant_models(model)), db_alias) ContentType.objects.get_for_models(*get_descendant_models(model))
.values()]
def get_content_types_pks(models, db_alias): def get_content_types_pk(model):
# We import it locally because this file is loaded before apps are ready. # We import it locally because this file is loaded before apps are ready.
from django.contrib.contenttypes.models import ContentType from django.contrib.contenttypes.models import ContentType
content_types_dict = ContentType.objects.db_manager(db_alias).get_for_models(*models) return ContentType.objects.get_for_model(model).pk
return [ct.pk for ct in content_types_dict.values()]
def get_search_fields(search_fields): def get_search_fields(search_fields):