diff --git a/datasette/facets.py b/datasette/facets.py index ccd85461..f49575d9 100644 --- a/datasette/facets.py +++ b/datasette/facets.py @@ -65,6 +65,8 @@ def register_facet_classes(): class Facet: type = None + # How many rows to consider when suggesting facets: + suggest_consider = 1000 def __init__( self, @@ -145,17 +147,6 @@ class Facet: ) ).columns - async def get_row_count(self): - if self.row_count is None: - self.row_count = ( - await self.ds.execute( - self.database, - f"select count(*) from ({self.sql})", - self.params, - ) - ).rows[0][0] - return self.row_count - class ColumnFacet(Facet): type = "column" @@ -170,13 +161,16 @@ class ColumnFacet(Facet): if column in already_enabled: continue suggested_facet_sql = """ - select {column} as value, count(*) as n from ( - {sql} - ) where value is not null + with limited as (select * from ({sql}) limit {suggest_consider}) + select {column} as value, count(*) as n from limited + where value is not null group by value limit {limit} """.format( - column=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 + column=escape_sqlite(column), + sql=self.sql, + limit=facet_size + 1, + suggest_consider=self.suggest_consider, ) distinct_values = None try: @@ -211,6 +205,17 @@ class ColumnFacet(Facet): continue return suggested_facets + async def get_row_count(self): + if self.row_count is None: + self.row_count = ( + await self.ds.execute( + self.database, + f"select count(*) from (select * from ({self.sql}) limit {self.suggest_consider})", + self.params, + ) + ).rows[0][0] + return self.row_count + async def facet_results(self): facet_results = [] facets_timed_out = [] @@ -313,11 +318,14 @@ class ArrayFacet(Facet): continue # Is every value in this column either null or a JSON array? suggested_facet_sql = """ + with limited as (select * from ({sql}) limit {suggest_consider}) select distinct json_type({column}) - from ({sql}) + from limited where {column} is not null and {column} != '' """.format( - column=escape_sqlite(column), sql=self.sql + column=escape_sqlite(column), + sql=self.sql, + suggest_consider=self.suggest_consider, ) try: results = await self.ds.execute( @@ -402,7 +410,9 @@ class ArrayFacet(Facet): order by count(*) desc, value limit {limit} """.format( - col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 + col=escape_sqlite(column), + sql=self.sql, + limit=facet_size + 1, ) try: facet_rows_results = await self.ds.execute( diff --git a/tests/test_facets.py b/tests/test_facets.py index 023efcf0..a2b505ec 100644 --- a/tests/test_facets.py +++ b/tests/test_facets.py @@ -1,6 +1,6 @@ from datasette.app import Datasette from datasette.database import Database -from datasette.facets import ColumnFacet, ArrayFacet, DateFacet +from datasette.facets import Facet, ColumnFacet, ArrayFacet, DateFacet from datasette.utils.asgi import Request from datasette.utils import detect_json1 from .fixtures import make_app_client @@ -662,3 +662,39 @@ async def test_facet_against_in_memory_database(): assert response1.status_code == 200 response2 = await ds.client.get("/mem/t?_facet=name&_facet=name2") assert response2.status_code == 200 + + +@pytest.mark.asyncio +async def test_facet_only_considers_first_x_rows(): + # This test works by manually fiddling with Facet.suggest_consider + ds = Datasette() + original_suggest_consider = Facet.suggest_consider + try: + Facet.suggest_consider = 40 + db = ds.add_memory_database("test_facet_only_x_rows") + await db.execute_write("create table t (id integer primary key, col text)") + # First 50 rows make it look like col and col_json should be faceted + to_insert = [{"col": "one" if i % 2 else "two"} for i in range(50)] + await db.execute_write_many("insert into t (col) values (:col)", to_insert) + # Next 50 break that assumption + to_insert2 = [{"col": f"x{i}"} for i in range(50)] + await db.execute_write_many("insert into t (col) values (:col)", to_insert2) + response = await ds.client.get( + "/test_facet_only_x_rows/t.json?_extra=suggested_facets" + ) + data = response.json() + assert data["suggested_facets"] == [ + { + "name": "col", + "toggle_url": "http://localhost/test_facet_only_x_rows/t.json?_extra=suggested_facets&_facet=col", + } + ] + # But if we set suggest_consider to 100 they are not suggested + Facet.suggest_consider = 100 + response2 = await ds.client.get( + "/test_facet_only_x_rows/t.json?_extra=suggested_facets" + ) + data2 = response2.json() + assert data2["suggested_facets"] == [] + finally: + Facet.suggest_consider = original_suggest_consider