From ea66c45df96479ef66a89caa71fff1a97a862646 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 2 May 2019 17:11:26 -0700 Subject: [PATCH] Extract facet code out into a new plugin hook, closes #427 (#445) Datasette previously only supported one type of faceting: exact column value counting. With this change, faceting logic is extracted out into one or more separate classes which can implement other patterns of faceting - this is discussed in #427, but potential upcoming facet types include facet-by-date, facet-by-JSON-array, facet-by-many-2-many and more. A new plugin hook, register_facet_classes, can be used by plugins to add in additional facet classes. Each class must implement two methods: suggest(), which scans columns in the table to decide if they might be worth suggesting for faceting, and facet_results(), which executes the facet operation and returns results ready to be displayed in the UI. --- datasette/app.py | 12 +- datasette/facets.py | 251 +++++++++++++++++++++++++++++++++ datasette/hookspecs.py | 5 + datasette/plugins.py | 6 +- datasette/templates/table.html | 8 +- datasette/views/table.py | 168 +++++++--------------- docs/plugins.rst | 65 +++++++++ tests/test_api.py | 33 ++++- tests/test_facets.py | 174 +++++++++++++++++++++++ tests/utils.py | 8 ++ 10 files changed, 599 insertions(+), 131 deletions(-) create mode 100644 datasette/facets.py create mode 100644 tests/test_facets.py create mode 100644 tests/utils.py diff --git a/datasette/app.py b/datasette/app.py index 282c161a..ac475bb4 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -676,6 +676,7 @@ class Datasette: truncate=False, custom_time_limit=None, page_size=None, + log_sql_errors=True, ): """Executes sql against db_name in a thread""" page_size = page_size or self.page_size @@ -701,12 +702,13 @@ class Datasette: truncated = False except sqlite3.OperationalError as e: if e.args == ('interrupted',): - raise InterruptedError(e) - print( - "ERROR: conn={}, sql = {}, params = {}: {}".format( - conn, repr(sql), params, e + raise InterruptedError(e, sql, params) + if log_sql_errors: + print( + "ERROR: conn={}, sql = {}, params = {}: {}".format( + conn, repr(sql), params, e + ) ) - ) raise if truncate: diff --git a/datasette/facets.py b/datasette/facets.py new file mode 100644 index 00000000..9e3b3044 --- /dev/null +++ b/datasette/facets.py @@ -0,0 +1,251 @@ +import json +import urllib +import re +from datasette import hookimpl +from datasette.utils import ( + escape_sqlite, + get_all_foreign_keys, + path_with_added_args, + path_with_removed_args, + detect_json1, + InterruptedError, + InvalidSql, + sqlite3, +) + + +def load_facet_configs(request, table_metadata): + # Given a request and the metadata configuration for a table, return + # a dictionary of selected facets, their lists of configs and for each + # config whether it came from the request or the metadata. + # + # return {type: [ + # {"source": "metadata", "config": config1}, + # {"source": "request", "config": config2}]} + facet_configs = {} + table_metadata = table_metadata or {} + metadata_facets = table_metadata.get("facets", []) + for metadata_config in metadata_facets: + if isinstance(metadata_config, str): + type = "column" + metadata_config = {"simple": metadata_config} + else: + # This should have a single key and a single value + assert len(metadata_config.values()) == 1, "Metadata config dicts should be {type: config}" + type, metadata_config = metadata_config.items()[0] + if isinstance(metadata_config, str): + metadata_config = {"simple": metadata_config} + facet_configs.setdefault(type, []).append({ + "source": "metadata", + "config": metadata_config + }) + qs_pairs = urllib.parse.parse_qs(request.query_string, keep_blank_values=True) + for key, values in qs_pairs.items(): + if key.startswith("_facet"): + # Figure out the facet type + if key == "_facet": + type = "column" + elif key.startswith("_facet_"): + type = key[len("_facet_") :] + for value in values: + # The value is the config - either JSON or not + if value.startswith("{"): + config = json.loads(value) + else: + config = {"simple": value} + facet_configs.setdefault(type, []).append({ + "source": "request", + "config": config + }) + return facet_configs + + +@hookimpl +def register_facet_classes(): + return [ColumnFacet] + + +class Facet: + type = None + + def __init__( + self, + ds, + request, + database, + sql=None, + table=None, + params=None, + metadata=None, + row_count=None, + ): + assert table or sql, "Must provide either table= or sql=" + self.ds = ds + self.request = request + self.database = database + # For foreign key expansion. Can be None for e.g. canned SQL queries: + self.table = table + self.sql = sql or "select * from [{}]".format(table) + self.params = params or [] + self.metadata = metadata + # row_count can be None, in which case we calculate it ourselves: + self.row_count = row_count + + def get_configs(self): + configs = load_facet_configs(self.request, self.metadata) + return configs.get(self.type) or [] + + def get_querystring_pairs(self): + # ?_foo=bar&_foo=2&empty= becomes: + # [('_foo', 'bar'), ('_foo', '2'), ('empty', '')] + return urllib.parse.parse_qsl(self.request.query_string, keep_blank_values=True) + + async def suggest(self): + return [] + + async def facet_results(self): + # returns ([results], [timed_out]) + # TODO: Include "hideable" with each one somehow, which indicates if it was + # defined in metadata (in which case you cannot turn it off) + raise NotImplementedError + + async def get_columns(self, sql, params=None): + # Detect column names using the "limit 0" trick + return ( + await self.ds.execute( + self.database, "select * from ({}) limit 0".format(sql), params or [] + ) + ).columns + + async def get_row_count(self): + if self.row_count is None: + self.row_count = ( + await self.ds.execute( + self.database, + "select count(*) from ({})".format(self.sql), + self.params, + ) + ).rows[0][0] + return self.row_count + + +class ColumnFacet(Facet): + type = "column" + + async def suggest(self): + row_count = await self.get_row_count() + columns = await self.get_columns(self.sql, self.params) + facet_size = self.ds.config("default_facet_size") + suggested_facets = [] + already_enabled = [c["config"]["simple"] for c in self.get_configs()] + for column in columns: + if column in already_enabled: + continue + suggested_facet_sql = """ + select distinct {column} from ( + {sql} + ) where {column} is not null + limit {limit} + """.format( + column=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 + ) + distinct_values = None + try: + distinct_values = await self.ds.execute( + self.database, + suggested_facet_sql, + self.params, + truncate=False, + custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), + ) + num_distinct_values = len(distinct_values) + if ( + num_distinct_values + and num_distinct_values > 1 + and num_distinct_values <= facet_size + and num_distinct_values < row_count + ): + suggested_facets.append( + { + "name": column, + "toggle_url": self.ds.absolute_url( + self.request, + path_with_added_args(self.request, {"_facet": column}), + ), + } + ) + except InterruptedError: + continue + return suggested_facets + + async def facet_results(self): + facet_results = {} + facets_timed_out = [] + + qs_pairs = self.get_querystring_pairs() + + facet_size = self.ds.config("default_facet_size") + for source_and_config in self.get_configs(): + config = source_and_config["config"] + source = source_and_config["source"] + column = config.get("column") or config["simple"] + facet_sql = """ + select {col} as value, count(*) as count from ( + {sql} + ) + where {col} is not null + group by {col} order by count desc limit {limit} + """.format( + col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 + ) + try: + facet_rows_results = await self.ds.execute( + self.database, + facet_sql, + self.params, + truncate=False, + custom_time_limit=self.ds.config("facet_time_limit_ms"), + ) + facet_results_values = [] + facet_results[column] = { + "name": column, + "type": self.type, + "hideable": source != "metadata", + "toggle_url": path_with_removed_args(self.request, {"_facet": column}), + "results": facet_results_values, + "truncated": len(facet_rows_results) > facet_size, + } + facet_rows = facet_rows_results.rows[:facet_size] + if self.table: + # Attempt to expand foreign keys into labels + values = [row["value"] for row in facet_rows] + expanded = await self.ds.expand_foreign_keys( + self.database, self.table, column, values + ) + else: + expanded = {} + for row in facet_rows: + selected = (column, str(row["value"])) in qs_pairs + if selected: + toggle_path = path_with_removed_args( + self.request, {column: str(row["value"])} + ) + else: + toggle_path = path_with_added_args( + self.request, {column: row["value"]} + ) + facet_results_values.append( + { + "value": row["value"], + "label": expanded.get((column, row["value"]), row["value"]), + "count": row["count"], + "toggle_url": self.ds.absolute_url( + self.request, toggle_path + ), + "selected": selected, + } + ) + except InterruptedError: + facets_timed_out.append(column) + + return facet_results, facets_timed_out diff --git a/datasette/hookspecs.py b/datasette/hookspecs.py index 576e05b9..4dc6338e 100644 --- a/datasette/hookspecs.py +++ b/datasette/hookspecs.py @@ -43,3 +43,8 @@ def render_cell(value, column, table, database, datasette): @hookspec def register_output_renderer(datasette): "Register a renderer to output data in a different format" + + +@hookspec +def register_facet_classes(): + "Register Facet subclasses" diff --git a/datasette/plugins.py b/datasette/plugins.py index 8981ace7..4e038923 100644 --- a/datasette/plugins.py +++ b/datasette/plugins.py @@ -3,7 +3,11 @@ import pluggy import sys from . import hookspecs -DEFAULT_PLUGINS = ("datasette.publish.heroku", "datasette.publish.now") +DEFAULT_PLUGINS = ( + "datasette.publish.heroku", + "datasette.publish.now", + "datasette.facets", +) pm = pluggy.PluginManager("datasette") pm.add_hookspecs(hookspecs) diff --git a/datasette/templates/table.html b/datasette/templates/table.html index d28f41d6..5ba3ff6d 100644 --- a/datasette/templates/table.html +++ b/datasette/templates/table.html @@ -110,7 +110,7 @@ {% if suggested_facets %}

- Suggested facets: {% for facet in suggested_facets %}{{ facet.name }}{% if not loop.last %}, {% endif %}{% endfor %} + Suggested facets: {% for facet in suggested_facets %}{{ facet.name }}{% if facet.type %} ({{ facet.type }}){% endif %}{% if not loop.last %}, {% endif %}{% endfor %}

{% endif %} @@ -123,9 +123,9 @@ {% for facet_info in sorted_facet_results %}

- {{ facet_info.name }} - {% if facet_hideable(facet_info.name) %} - + {{ facet_info.name }}{% if facet_info.type != "column" %} ({{ facet_info.type }}){% endif %} + {% if facet_info.hideable %} + {% endif %}