From 6f41c8a2bef309a66588b2875c3e24d26adb4850 Mon Sep 17 00:00:00 2001 From: Simon Willison Date: Thu, 18 Feb 2021 14:09:12 -0800 Subject: [PATCH] --crossdb option for joining across databases (#1232) * Test for cross-database join, refs #283 * Warn if --crossdb used with more than 10 DBs, refs #283 * latest.datasette.io demo of --crossdb joins, refs #283 * Show attached databases on /_memory page, refs #283 * Documentation for cross-database queries, refs #283 --- .github/workflows/deploy-latest.yml | 6 +-- datasette/app.py | 20 +++++++- datasette/cli.py | 21 +++++++- datasette/database.py | 11 ++++- datasette/templates/database.html | 11 +++++ datasette/views/database.py | 3 ++ docs/datasette-serve-help.txt | 1 + docs/internals.rst | 3 ++ docs/sql_queries.rst | 31 ++++++++++++ tests/fixtures.py | 29 ++++++++++- tests/test_cli.py | 1 + tests/test_crossdb.py | 75 +++++++++++++++++++++++++++++ tests/test_internals_database.py | 11 ++++- 13 files changed, 215 insertions(+), 8 deletions(-) create mode 100644 tests/test_crossdb.py diff --git a/.github/workflows/deploy-latest.yml b/.github/workflows/deploy-latest.yml index 2de0a8b6..43e46fb4 100644 --- a/.github/workflows/deploy-latest.yml +++ b/.github/workflows/deploy-latest.yml @@ -31,7 +31,7 @@ jobs: - name: Run tests run: pytest - name: Build fixtures.db - run: python tests/fixtures.py fixtures.db fixtures.json plugins + run: python tests/fixtures.py fixtures.db fixtures.json plugins --extra-db-filename extra_database.db - name: Build docs.db run: |- cd docs @@ -48,12 +48,12 @@ jobs: run: |- gcloud config set run/region us-central1 gcloud config set project datasette-222320 - datasette publish cloudrun fixtures.db \ + datasette publish cloudrun fixtures.db extra_database.db \ -m fixtures.json \ --plugins-dir=plugins \ --branch=$GITHUB_SHA \ --version-note=$GITHUB_SHA \ - --extra-options="--setting template_debug 1" \ + --extra-options="--setting template_debug 1 --crossdb" \ --install=pysqlite3-binary \ 
--service=datasette-latest # Deploy docs.db to a different service diff --git a/datasette/app.py b/datasette/app.py index 9e15a162..e3272c6e 100644 --- a/datasette/app.py +++ b/datasette/app.py @@ -85,6 +85,9 @@ from .version import __version__ app_root = Path(__file__).parent.parent +# https://github.com/simonw/datasette/issues/283#issuecomment-781591015 +SQLITE_LIMIT_ATTACHED = 10 + Setting = collections.namedtuple("Setting", ("name", "default", "help")) SETTINGS = ( Setting("default_page_size", 100, "Default page size for the table view"), @@ -194,6 +197,7 @@ class Datasette: version_note=None, config_dir=None, pdb=False, + crossdb=False, ): assert config_dir is None or isinstance( config_dir, Path @@ -217,7 +221,8 @@ class Datasette: self.inspect_data = inspect_data self.immutables = set(immutables or []) self.databases = collections.OrderedDict() - if memory or not self.files: + self.crossdb = crossdb + if memory or crossdb or not self.files: self.add_database(Database(self, is_memory=True), name="_memory") # memory_name is a random string so that each Datasette instance gets its own # unique in-memory named database - otherwise unit tests can fail with weird @@ -499,6 +504,19 @@ class Datasette: conn.execute(f"PRAGMA cache_size=-{self.setting('cache_size_kb')}") # pylint: disable=no-member pm.hook.prepare_connection(conn=conn, database=database, datasette=self) + # If self.crossdb and this is _memory, connect the first SQLITE_LIMIT_ATTACHED databases + if self.crossdb and database == "_memory": + count = 0 + for db_name, db in self.databases.items(): + if count >= SQLITE_LIMIT_ATTACHED or db.is_memory: + continue + sql = 'ATTACH DATABASE "file:{path}?{qs}" AS [{name}];'.format( + path=db.path, + qs="mode=ro" if db.is_mutable else "immutable=1", + name=db_name, + ) + conn.execute(sql) + count += 1 def add_message(self, request, message, type=INFO): if not hasattr(request, "_messages"): diff --git a/datasette/cli.py b/datasette/cli.py index 815f9718..96a41740 
100644 --- a/datasette/cli.py +++ b/datasette/cli.py @@ -12,7 +12,7 @@ from subprocess import call import sys from runpy import run_module import webbrowser -from .app import Datasette, DEFAULT_SETTINGS, SETTINGS, pm +from .app import Datasette, DEFAULT_SETTINGS, SETTINGS, SQLITE_LIMIT_ATTACHED, pm from .utils import ( StartupError, check_connection, @@ -410,6 +410,11 @@ def uninstall(packages, yes): is_flag=True, help="Create database files if they do not exist", ) +@click.option( + "--crossdb", + is_flag=True, + help="Enable cross-database joins using the /_memory database", +) @click.option( "--ssl-keyfile", help="SSL key file", @@ -442,6 +447,7 @@ def serve( pdb, open_browser, create, + crossdb, ssl_keyfile, ssl_certfile, return_instance=False, @@ -499,6 +505,7 @@ def serve( secret=secret, version_note=version_note, pdb=pdb, + crossdb=crossdb, ) # if files is a single directory, use that as config_dir= @@ -591,3 +598,15 @@ async def check_databases(ds): raise click.UsageError( f"Connection to {database.path} failed check: {str(e.args[0])}" ) + # If --crossdb and more than SQLITE_LIMIT_ATTACHED show warning + if ( + ds.crossdb + and len([db for db in ds.databases.values() if not db.is_memory]) + > SQLITE_LIMIT_ATTACHED + ): + msg = ( + "Warning: --crossdb only works with the first {} attached databases".format( + SQLITE_LIMIT_ATTACHED + ) + ) + click.echo(click.style(msg, bold=True, fg="yellow"), err=True) diff --git a/datasette/database.py b/datasette/database.py index cda36e6e..3579cce9 100644 --- a/datasette/database.py +++ b/datasette/database.py @@ -1,4 +1,5 @@ import asyncio +from collections import namedtuple from pathlib import Path import janus import queue @@ -22,6 +23,8 @@ from .inspect import inspect_hash connections = threading.local() +AttachedDatabase = namedtuple("AttachedDatabase", ("seq", "name", "file")) + class Database: def __init__( @@ -78,7 +81,7 @@ class Database: conn.execute("PRAGMA query_only=1") return conn if self.is_memory: - return 
sqlite3.connect(":memory:") + return sqlite3.connect(":memory:", uri=True) # mode=ro or immutable=1? if self.is_mutable: qs = "?mode=ro" @@ -243,6 +246,12 @@ class Database: return None return Path(self.path).stat().st_mtime_ns + async def attached_databases(self): + results = await self.execute( + "select seq, name, file from pragma_database_list() where seq > 0" + ) + return [AttachedDatabase(*row) for row in results.rows] + async def table_exists(self, table): results = await self.execute( "select 1 from sqlite_master where type='table' and name=?", params=(table,) diff --git a/datasette/templates/database.html b/datasette/templates/database.html index 7065f2c2..3fe7c891 100644 --- a/datasette/templates/database.html +++ b/datasette/templates/database.html @@ -56,6 +56,17 @@ {% endif %} +{% if attached_databases %} +
+

The following databases are attached to this connection, and can be used for cross-database joins:

+ +
+{% endif %} + {% for table in tables %} {% if show_hidden or not table.hidden %}
diff --git a/datasette/views/database.py b/datasette/views/database.py index 75eb8f02..0c58a351 100644 --- a/datasette/views/database.py +++ b/datasette/views/database.py @@ -115,6 +115,8 @@ class DatabaseView(DataView): links.extend(extra_links) return links + attached_databases = [d.name for d in await db.attached_databases()] + return ( { "database": database, @@ -139,6 +141,7 @@ class DatabaseView(DataView): "allow_download": self.ds.setting("allow_download") and not db.is_mutable and not db.is_memory, + "attached_databases": attached_databases, }, (f"database-{to_css_class(database)}.html", "database.html"), ) diff --git a/docs/datasette-serve-help.txt b/docs/datasette-serve-help.txt index 243637cb..f0dab3ea 100644 --- a/docs/datasette-serve-help.txt +++ b/docs/datasette-serve-help.txt @@ -41,6 +41,7 @@ Options: --pdb Launch debugger on any errors -o, --open Open Datasette in your web browser --create Create database files if they do not exist + --crossdb Enable cross-database joins using the /_memory database --ssl-keyfile TEXT SSL key file --ssl-certfile TEXT SSL certificate file --help Show this message and exit. diff --git a/docs/internals.rst b/docs/internals.rst index 4a2c0a8e..a46fe0f5 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -677,6 +677,9 @@ The ``Database`` class also provides properties and methods for introspecting th ``db.is_memory`` - boolean Is this database an in-memory database? +``await db.attached_databases()`` - list of named tuples + Returns a list of additional databases that have been connected to this database using the SQLite ATTACH command. Each named tuple has fields ``seq``, ``name`` and ``file``. + ``await db.table_exists(table)`` - boolean Check if a table called ``table`` exists. diff --git a/docs/sql_queries.rst b/docs/sql_queries.rst index 93f17eaf..bb263b18 100644 --- a/docs/sql_queries.rst +++ b/docs/sql_queries.rst @@ -389,3 +389,34 @@ detect if there should be another page. 
Since the where clause acts against the index on the primary key, the query is extremely fast even for records that are a long way into the overall pagination set. + +.. _cross_database_quereies: + +Cross-database queries +---------------------- + +SQLite has the ability to run queries that join across multiple databases. Up to ten databases can be attached to a single SQLite connection and queried together. + +Datasette can execute joins across multiple databases if it is started with the ``--crossdb`` option:: + + datasette fixtures.db extra_database.db --crossdb + +If it is started in this way, the ``/_memory`` page can be used to execute queries that join across multiple databases. + +References to tables in attached databases should be preceded by the database name and a period. + +For example, this query will show a list of tables across both of the above databases: + +.. code-block:: sql + + select + 'fixtures' as database, * + from + [fixtures].sqlite_master + union + select + 'extra_database' as database, * + from + [extra_database].sqlite_master + +`Try that out here `__. 
diff --git a/tests/fixtures.py b/tests/fixtures.py index 1ec6a2ba..30113ff2 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -105,6 +105,7 @@ def make_app_client( static_mounts=None, template_dir=None, metadata=None, + crossdb=False, ): with tempfile.TemporaryDirectory() as tmpdir: filepath = os.path.join(tmpdir, filename) @@ -149,6 +150,7 @@ def make_app_client( inspect_data=inspect_data, static_mounts=static_mounts, template_dir=template_dir, + crossdb=crossdb, ) ds.sqlite_functions.append(("sleep", 1, lambda n: time.sleep(float(n)))) yield TestClient(ds) @@ -180,6 +182,15 @@ def app_client_two_attached_databases(): yield client +@pytest.fixture(scope="session") +def app_client_two_attached_databases_crossdb_enabled(): + with make_app_client( + extra_databases={"extra database.db": EXTRA_DATABASE_SQL}, + crossdb=True, + ) as client: + yield client + + @pytest.fixture(scope="session") def app_client_conflicting_database_names(): with make_app_client( @@ -750,7 +761,12 @@ def assert_permissions_checked(datasette, actions): default=False, help="Delete and recreate database if it exists", ) -def cli(db_filename, metadata, plugins_path, recreate): +@click.option( + "--extra-db-filename", + type=click.Path(file_okay=True, dir_okay=False), + help="Write out second test DB to this file", +) +def cli(db_filename, metadata, plugins_path, recreate, extra_db_filename): """Write out the fixtures database used by Datasette's test suite""" if metadata and not metadata.endswith(".json"): raise click.ClickException("Metadata should end with .json") @@ -784,6 +800,17 @@ def cli(db_filename, metadata, plugins_path, recreate): newpath = path / filepath.name newpath.write_text(filepath.open().read()) print(f" Wrote plugin: {newpath}") + if extra_db_filename: + if pathlib.Path(extra_db_filename).exists(): + if not recreate: + raise click.ClickException( + f"{extra_db_filename} already exists, use --recreate to reset it" + ) + else: + pathlib.Path(extra_db_filename).unlink() + 
conn = sqlite3.connect(extra_db_filename) + conn.executescript(EXTRA_DATABASE_SQL) + print(f"Test tables written to {extra_db_filename}") if __name__ == "__main__": diff --git a/tests/test_cli.py b/tests/test_cli.py index 4bcf615b..8ddd32f6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -147,6 +147,7 @@ def test_metadata_yaml(): get=None, help_config=False, pdb=False, + crossdb=False, open_browser=False, create=False, ssl_keyfile=None, diff --git a/tests/test_crossdb.py b/tests/test_crossdb.py new file mode 100644 index 00000000..01c51130 --- /dev/null +++ b/tests/test_crossdb.py @@ -0,0 +1,75 @@ +from datasette.cli import cli +from click.testing import CliRunner +import urllib +import sqlite3 +from .fixtures import app_client_two_attached_databases_crossdb_enabled + + +def test_crossdb_join(app_client_two_attached_databases_crossdb_enabled): + app_client = app_client_two_attached_databases_crossdb_enabled + sql = """ + select + 'extra database' as db, + pk, + text1, + text2 + from + [extra database].searchable + union all + select + 'fixtures' as db, + pk, + text1, + text2 + from + fixtures.searchable + """ + response = app_client.get( + "/_memory.json?" 
+ urllib.parse.urlencode({"sql": sql, "_shape": "array"}) + ) + assert response.status == 200 + assert response.json == [ + {"db": "extra database", "pk": 1, "text1": "barry cat", "text2": "terry dog"}, + {"db": "extra database", "pk": 2, "text1": "terry dog", "text2": "sara weasel"}, + {"db": "fixtures", "pk": 1, "text1": "barry cat", "text2": "terry dog"}, + {"db": "fixtures", "pk": 2, "text1": "terry dog", "text2": "sara weasel"}, + ] + + +def test_crossdb_warning_if_too_many_databases(tmp_path_factory): + db_dir = tmp_path_factory.mktemp("dbs") + dbs = [] + for i in range(11): + path = str(db_dir / "db_{}.db".format(i)) + conn = sqlite3.connect(path) + conn.execute("vacuum") + dbs.append(path) + runner = CliRunner(mix_stderr=False) + result = runner.invoke( + cli, + [ + "serve", + "--crossdb", + "--get", + "/", + ] + + dbs, + catch_exceptions=False, + ) + assert ( + "Warning: --crossdb only works with the first 10 attached databases" + in result.stderr + ) + + +def test_crossdb_attached_database_list_display( + app_client_two_attached_databases_crossdb_enabled, +): + app_client = app_client_two_attached_databases_crossdb_enabled + response = app_client.get("/_memory") + for fragment in ( + "databases are attached to this connection", + "
  • fixtures - ", + "
  • extra database - ", + ): + assert fragment in response.text diff --git a/tests/test_internals_database.py b/tests/test_internals_database.py index 7eff9f7e..086f1a48 100644 --- a/tests/test_internals_database.py +++ b/tests/test_internals_database.py @@ -4,7 +4,7 @@ Tests for the datasette.database.Database class from datasette.database import Database, Results, MultipleValues from datasette.utils.sqlite import sqlite3, supports_generated_columns from datasette.utils import Column -from .fixtures import app_client +from .fixtures import app_client, app_client_two_attached_databases_crossdb_enabled import pytest import time import uuid @@ -466,6 +466,15 @@ def test_is_mutable(app_client): assert Database(app_client.ds, is_memory=True, is_mutable=False).is_mutable is False +@pytest.mark.asyncio +async def test_attached_databases(app_client_two_attached_databases_crossdb_enabled): + database = app_client_two_attached_databases_crossdb_enabled.ds.get_database( + "_memory" + ) + attached = await database.attached_databases() + assert {a.name for a in attached} == {"extra database", "fixtures"} + + @pytest.mark.asyncio async def test_database_memory_name(app_client): ds = app_client.ds