Merge pull request #3940 from BertrandBordage/postgres_search_improvements

Postgres_search simplification.
pull/3965/head
Karl Hobley 2017-10-19 16:27:48 +01:00 zatwierdzone przez GitHub
commit 6514650aa4
7 zmienionych plików z 95 dodań i 157 usunięć

Wyświetl plik

@ -33,7 +33,7 @@ def pytest_configure(config):
pass
if config.getoption('postgres'):
os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql_psycopg2'
os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql'
# Setup django after processing the pytest arguments so that the env
# variables are available in the settings

Wyświetl plik

@ -560,7 +560,7 @@ These two files should reside in your project directory (``myproject/myproject/`
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql_psycopg2',
'ENGINE': 'django.db.backends.postgresql',
'NAME': 'myprojectdb',
'USER': 'postgres',
'PASSWORD': '',

Wyświetl plik

@ -47,7 +47,7 @@ def runtests():
pass
if args.postgres:
os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql_psycopg2'
os.environ['DATABASE_ENGINE'] = 'django.db.backends.postgresql'
if args.elasticsearch:
os.environ.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200')

Wyświetl plik

@ -16,25 +16,13 @@ from wagtail.wagtailsearch.index import RelatedFields, SearchField
from .models import IndexEntry
from .utils import (
ADD, AND, OR, WEIGHTS_VALUES, get_content_types_pks, get_postgresql_connections, get_weight,
keyword_split, unidecode)
ADD, AND, OR, WEIGHTS_VALUES, get_content_types_pk, get_descendants_content_types_pks,
get_postgresql_connections, get_weight, keyword_split, unidecode)
# TODO: Add autocomplete.
def get_db_alias(queryset):
return queryset._db or DEFAULT_DB_ALIAS
def get_sql(queryset):
return queryset.query.get_compiler(get_db_alias(queryset)).as_sql()
def get_pk_column(model):
return model._meta.pk.get_attname_column()[1]
@python_2_unicode_compatible
class Index(object):
def __init__(self, backend, model, db_alias=None):
@ -64,14 +52,13 @@ class Index(object):
existing_pks = (self.model._default_manager.using(self.db_alias)
.annotate(object_id=Cast('pk', TextField()))
.values('object_id'))
stale_entries = (IndexEntry._default_manager.using(self.db_alias)
.for_models(self.model)
.exclude(object_id__in=existing_pks))
content_type_ids = get_descendants_content_types_pks(self.model)
stale_entries = (
IndexEntry._default_manager.using(self.db_alias)
.filter(content_type_id__in=content_type_ids)
.exclude(object_id__in=existing_pks))
stale_entries.delete()
def get_config(self):
return self.backend.params.get('SEARCH_CONFIG')
def prepare_value(self, value):
if isinstance(value, string_types):
return value
@ -134,9 +121,8 @@ class Index(object):
ids_and_objs = {}
for obj in objs:
obj._search_vector = (
ADD([
SearchVector(Value(text), weight=weight, config=config)
for text, weight in obj._body_])
ADD([SearchVector(Value(text), weight=weight, config=config)
for text, weight in obj._body_])
if obj._body_ else SearchVector(Value('')))
ids_and_objs[obj._object_id] = obj
index_entries = IndexEntry._default_manager.using(self.db_alias)
@ -160,8 +146,8 @@ class Index(object):
index_entries.bulk_create(to_be_created)
def add_items(self, model, objs):
content_type_pk = get_content_types_pks((model,), self.db_alias)[0]
config = self.get_config()
content_type_pk = get_content_types_pk(model)
config = self.backend.get_config()
for obj in objs:
obj._object_id = force_text(obj.pk)
obj._body_ = self.prepare_body(obj)
@ -189,27 +175,6 @@ class PostgresSearchQuery(BaseSearchQuery):
return SearchQuery('')
return combine(SearchQuery(q, config=config) for q in search_terms)
def get_base_queryset(self):
# Removes the ordering for performance's sake.
return self.queryset.order_by()
def get_in_index_queryset(self, queryset, search_query):
return (IndexEntry._default_manager.using(get_db_alias(queryset))
.for_models(queryset.model).filter(body_search=search_query))
def get_in_index_count(self, queryset, search_query):
index_sql, index_params = get_sql(
self.get_in_index_queryset(queryset, search_query).pks())
model_sql, model_params = get_sql(queryset)
sql = """
SELECT COUNT(*)
FROM (%s) AS index_entry
INNER JOIN (%s) AS obj ON obj."%s" = index_entry.typed_pk;
""" % (index_sql, model_sql, get_pk_column(queryset.model))
with connections[get_db_alias(queryset)].cursor() as cursor:
cursor.execute(sql, index_params + model_params)
return cursor.fetchone()[0]
def get_boost(self, field_name, fields=None):
if fields is None:
fields = self.search_fields
@ -226,78 +191,43 @@ class PostgresSearchQuery(BaseSearchQuery):
return self.get_boost(sub_field_name, field.fields)
return field.boost
def get_in_fields_queryset(self, queryset, search_query):
if not self.fields:
return queryset.none()
return (
queryset.annotate(
_search_=ADD(
SearchVector(field, config=search_query.config,
weight=get_weight(self.get_boost(field)))
for field in self.fields))
.filter(_search_=search_query))
def search_count(self, config):
queryset = self.get_base_queryset()
search_query = self.get_search_query(config=config)
if self.fields is None:
return self.get_in_index_count(queryset, search_query)
return self.get_in_fields_queryset(queryset, search_query).count()
def search_in_index(self, queryset, search_query, start, stop):
index_entries = self.get_in_index_queryset(queryset, search_query)
values = ['typed_pk']
if self.order_by_relevance:
index_entries = index_entries.rank(search_query)
values.append('rank')
order_sql = 'index_entry.rank DESC, id ASC'
else:
order_sql = 'id ASC'
index_sql, index_params = get_sql(
index_entries.annotate_typed_pk()
.values(*values)
)
model_sql, model_params = get_sql(queryset)
model = queryset.model
sql = """
SELECT obj.*
FROM (%s) AS index_entry
INNER JOIN (%s) AS obj ON obj."%s" = index_entry.typed_pk
ORDER BY %s
OFFSET %%s LIMIT %%s;
""" % (index_sql, model_sql, get_pk_column(model), order_sql)
limits = (start, None if stop is None else stop - start)
return model._default_manager.using(get_db_alias(queryset)).raw(
sql, index_params + model_params + limits)
def search_in_fields(self, queryset, search_query, start, stop):
return (self.get_in_fields_queryset(queryset, search_query)
.annotate(_rank_=SearchRank(F('_search_'), search_query,
weights=WEIGHTS_VALUES))
.order_by('-_rank_'))[start:stop]
def search(self, config, start, stop):
queryset = self.get_base_queryset()
if self.query_string is None:
return queryset[start:stop]
return self.queryset[start:stop]
search_query = self.get_search_query(config=config)
queryset = self.queryset
query = queryset.query
if self.fields is None:
return self.search_in_index(queryset, search_query, start, stop)
return self.search_in_fields(queryset, search_query, start, stop)
vector = F('index_entries__body_search')
else:
vector = ADD(
SearchVector(field, config=search_query.config,
weight=get_weight(self.get_boost(field)))
for field in self.fields)
vector = vector.resolve_expression(query)
search_query = search_query.resolve_expression(query)
lookup = IndexEntry._meta.get_field('body_search').get_lookup('exact')(
vector, search_query)
query.where.add(lookup, 'AND')
if self.order_by_relevance:
# Due to a Django bug, arrays are not automatically converted here.
converted_weights = '{' + ','.join(map(str, WEIGHTS_VALUES)) + '}'
queryset = queryset.order_by(SearchRank(vector, search_query,
weights=converted_weights).desc(),
'-pk')
elif not queryset.query.order_by:
# Adds a default ordering to avoid issue #3729.
queryset = queryset.order_by('-pk')
return queryset[start:stop]
class PostgresSearchResults(BaseSearchResults):
def get_config(self):
queryset = self.query.queryset
return self.backend.get_index_for_model(
queryset.model, queryset._db).get_config()
def _do_search(self):
return list(self.query.search(self.get_config(),
return list(self.query.search(self.backend.get_config(),
self.start, self.stop))
def _do_count(self):
return self.query.search_count(self.get_config())
return self.query.search(self.backend.get_config(), None, None).count()
class PostgresSearchRebuilder:
@ -345,6 +275,10 @@ class PostgresSearchBackend(BaseSearchBackend):
self.params = params
if params.get('ATOMIC_REBUILD', False):
self.rebuilder_class = self.atomic_rebuilder_class
IndexEntry.add_generic_relations()
def get_config(self):
return self.params.get('SEARCH_CONFIG')
def get_index_for_model(self, model, db_alias=None):
return Index(self, model, db_alias)
@ -370,7 +304,7 @@ class PostgresSearchBackend(BaseSearchBackend):
self.get_index_for_object(obj_list[0]).add_items(model, obj_list)
def delete(self, obj):
IndexEntry._default_manager.for_object(obj).delete()
obj.index_entries.all().delete()
SearchBackend = PostgresSearchBackend

Wyświetl plik

@ -44,5 +44,11 @@ class Migration(migrations.Migration):
'CREATE INDEX {0}_body_search ON {0} '
'USING GIN(body_search);'.format(table),
'DROP INDEX {}_body_search;'.format(table),
state_operations=[migrations.AddIndex(
model_name='indexentry',
index=django.contrib.postgres.indexes.GinIndex(
fields=['body_search'],
name='postgres_se_body_se_70ba1a_gin'),
)],
),
]

Wyświetl plik

@ -1,50 +1,43 @@
from __future__ import absolute_import, unicode_literals
from django.contrib.contenttypes.fields import GenericForeignKey
from django.apps import apps
from django.contrib.contenttypes.fields import GenericForeignKey, GenericRelation
from django.contrib.contenttypes.models import ContentType
from django.contrib.postgres.search import SearchRank, SearchVectorField
from django.db.models import (
CASCADE, AutoField, BigAutoField, BigIntegerField, F, ForeignKey, IntegerField, Model, QuerySet,
TextField)
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVectorField
from django.db.models import CASCADE, ForeignKey, Model, TextField
from django.db.models.functions import Cast
from django.utils.encoding import force_text, python_2_unicode_compatible
from django.utils.encoding import python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _
from .utils import WEIGHTS_VALUES, get_descendants_content_types_pks
from ...wagtailsearch.index import class_is_indexed
from .utils import get_descendants_content_types_pks
class IndexQuerySet(QuerySet):
def for_models(self, *models):
if not models:
return self.none()
return self.filter(
content_type_id__in=get_descendants_content_types_pks(models,
self._db))
class TextIDGenericRelation(GenericRelation):
def get_content_type_lookup(self, alias, remote_alias):
field = self.remote_field.model._meta.get_field(
self.content_type_field_name)
return field.get_lookup('in')(
field.get_col(remote_alias),
get_descendants_content_types_pks(self.model))
def for_object(self, obj):
db_alias = obj._state.db
return (self.using(db_alias).for_models(obj._meta.model)
.filter(object_id=force_text(obj.pk)))
def get_object_id_lookup(self, alias, remote_alias):
from_field = self.remote_field.model._meta.get_field(
self.object_id_field_name)
to_field = self.model._meta.pk
return from_field.get_lookup('exact')(
from_field.get_col(remote_alias),
Cast(to_field.get_col(alias), from_field))
def add_rank(self, search_query):
return self.annotate(
rank=SearchRank(
F('body_search'), search_query,
weights='{' + ','.join(map(str, WEIGHTS_VALUES)) + '}'))
def get_extra_restriction(self, where_class, alias, remote_alias):
cond = where_class()
cond.add(self.get_content_type_lookup(alias, remote_alias), 'AND')
cond.add(self.get_object_id_lookup(alias, remote_alias), 'AND')
return cond
def rank(self, search_query):
return self.add_rank(search_query).order_by('-rank')
def annotate_typed_pk(self):
cast_field = self.model._meta.pk
if isinstance(cast_field, BigAutoField):
cast_field = BigIntegerField()
elif isinstance(cast_field, AutoField):
cast_field = IntegerField()
return self.annotate(typed_pk=Cast('object_id', cast_field))
def pks(self):
return self.annotate_typed_pk().values_list('typed_pk', flat=True)
def resolve_related_fields(self):
return []
@python_2_unicode_compatible
@ -57,13 +50,11 @@ class IndexEntry(Model):
# TODO: Add per-object boosting.
body_search = SearchVectorField()
objects = IndexQuerySet.as_manager()
class Meta:
unique_together = ('content_type', 'object_id')
verbose_name = _('index entry')
verbose_name_plural = _('index entries')
# TODO: Move the GIN index from the migration to here.
indexes = [GinIndex(['body_search'])]
def __str__(self):
return '%s: %s' % (self.content_type.name, self.content_object)
@ -71,3 +62,10 @@ class IndexEntry(Model):
@property
def model(self):
return self.content_type.model
@classmethod
def add_generic_relations(cls):
for model in apps.get_models():
if class_is_indexed(model):
TextIDGenericRelation(cls).contribute_to_class(model,
'index_entries')

Wyświetl plik

@ -60,17 +60,17 @@ def get_descendant_models(model):
return descendant_models
def get_descendants_content_types_pks(models, db_alias):
return get_content_types_pks(
tuple(descendant_model for model in models
for descendant_model in get_descendant_models(model)), db_alias)
def get_descendants_content_types_pks(model):
from django.contrib.contenttypes.models import ContentType
return [ct.pk for ct in
ContentType.objects.get_for_models(*get_descendant_models(model))
.values()]
def get_content_types_pks(models, db_alias):
def get_content_types_pk(model):
# We import it locally because this file is loaded before apps are ready.
from django.contrib.contenttypes.models import ContentType
content_types_dict = ContentType.objects.db_manager(db_alias).get_for_models(*models)
return [ct.pk for ct in content_types_dict.values()]
return ContentType.objects.get_for_model(model).pk
def get_search_fields(search_fields):