--crossdb option for joining across databases (#1232)

* Test for cross-database join, refs #283
* Warn if --crossdb used with more than 10 DBs, refs #283
* latest.datasette.io demo of --crossdb joins, refs #283
* Show attached databases on /_memory page, refs #283
* Documentation for cross-database queries, refs #283
pull/1243/head
Simon Willison 2021-02-18 14:09:12 -08:00 zatwierdzone przez GitHub
rodzic 4df548e766
commit 6f41c8a2be
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
13 zmienionych plików z 215 dodań i 8 usunięć

Wyświetl plik

@ -31,7 +31,7 @@ jobs:
- name: Run tests - name: Run tests
run: pytest run: pytest
- name: Build fixtures.db - name: Build fixtures.db
run: python tests/fixtures.py fixtures.db fixtures.json plugins run: python tests/fixtures.py fixtures.db fixtures.json plugins --extra-db-filename extra_database.db
- name: Build docs.db - name: Build docs.db
run: |- run: |-
cd docs cd docs
@ -48,12 +48,12 @@ jobs:
run: |- run: |-
gcloud config set run/region us-central1 gcloud config set run/region us-central1
gcloud config set project datasette-222320 gcloud config set project datasette-222320
datasette publish cloudrun fixtures.db \ datasette publish cloudrun fixtures.db extra_database.db \
-m fixtures.json \ -m fixtures.json \
--plugins-dir=plugins \ --plugins-dir=plugins \
--branch=$GITHUB_SHA \ --branch=$GITHUB_SHA \
--version-note=$GITHUB_SHA \ --version-note=$GITHUB_SHA \
--extra-options="--setting template_debug 1" \ --extra-options="--setting template_debug 1 --crossdb" \
--install=pysqlite3-binary \ --install=pysqlite3-binary \
--service=datasette-latest --service=datasette-latest
# Deploy docs.db to a different service # Deploy docs.db to a different service

Wyświetl plik

@ -85,6 +85,9 @@ from .version import __version__
app_root = Path(__file__).parent.parent app_root = Path(__file__).parent.parent
# https://github.com/simonw/datasette/issues/283#issuecomment-781591015
SQLITE_LIMIT_ATTACHED = 10
Setting = collections.namedtuple("Setting", ("name", "default", "help")) Setting = collections.namedtuple("Setting", ("name", "default", "help"))
SETTINGS = ( SETTINGS = (
Setting("default_page_size", 100, "Default page size for the table view"), Setting("default_page_size", 100, "Default page size for the table view"),
@ -194,6 +197,7 @@ class Datasette:
version_note=None, version_note=None,
config_dir=None, config_dir=None,
pdb=False, pdb=False,
crossdb=False,
): ):
assert config_dir is None or isinstance( assert config_dir is None or isinstance(
config_dir, Path config_dir, Path
@ -217,7 +221,8 @@ class Datasette:
self.inspect_data = inspect_data self.inspect_data = inspect_data
self.immutables = set(immutables or []) self.immutables = set(immutables or [])
self.databases = collections.OrderedDict() self.databases = collections.OrderedDict()
if memory or not self.files: self.crossdb = crossdb
if memory or crossdb or not self.files:
self.add_database(Database(self, is_memory=True), name="_memory") self.add_database(Database(self, is_memory=True), name="_memory")
# memory_name is a random string so that each Datasette instance gets its own # memory_name is a random string so that each Datasette instance gets its own
# unique in-memory named database - otherwise unit tests can fail with weird # unique in-memory named database - otherwise unit tests can fail with weird
@ -499,6 +504,19 @@ class Datasette:
conn.execute(f"PRAGMA cache_size=-{self.setting('cache_size_kb')}") conn.execute(f"PRAGMA cache_size=-{self.setting('cache_size_kb')}")
# pylint: disable=no-member # pylint: disable=no-member
pm.hook.prepare_connection(conn=conn, database=database, datasette=self) pm.hook.prepare_connection(conn=conn, database=database, datasette=self)
# If self.crossdb and this is _memory, connect the first SQLITE_LIMIT_ATTACHED databases
if self.crossdb and database == "_memory":
count = 0
for db_name, db in self.databases.items():
if count >= SQLITE_LIMIT_ATTACHED or db.is_memory:
continue
sql = 'ATTACH DATABASE "file:{path}?{qs}" AS [{name}];'.format(
path=db.path,
qs="mode=ro" if db.is_mutable else "immutable=1",
name=db_name,
)
conn.execute(sql)
count += 1
def add_message(self, request, message, type=INFO): def add_message(self, request, message, type=INFO):
if not hasattr(request, "_messages"): if not hasattr(request, "_messages"):

Wyświetl plik

@ -12,7 +12,7 @@ from subprocess import call
import sys import sys
from runpy import run_module from runpy import run_module
import webbrowser import webbrowser
from .app import Datasette, DEFAULT_SETTINGS, SETTINGS, pm from .app import Datasette, DEFAULT_SETTINGS, SETTINGS, SQLITE_LIMIT_ATTACHED, pm
from .utils import ( from .utils import (
StartupError, StartupError,
check_connection, check_connection,
@ -410,6 +410,11 @@ def uninstall(packages, yes):
is_flag=True, is_flag=True,
help="Create database files if they do not exist", help="Create database files if they do not exist",
) )
@click.option(
"--crossdb",
is_flag=True,
help="Enable cross-database joins using the /_memory database",
)
@click.option( @click.option(
"--ssl-keyfile", "--ssl-keyfile",
help="SSL key file", help="SSL key file",
@ -442,6 +447,7 @@ def serve(
pdb, pdb,
open_browser, open_browser,
create, create,
crossdb,
ssl_keyfile, ssl_keyfile,
ssl_certfile, ssl_certfile,
return_instance=False, return_instance=False,
@ -499,6 +505,7 @@ def serve(
secret=secret, secret=secret,
version_note=version_note, version_note=version_note,
pdb=pdb, pdb=pdb,
crossdb=crossdb,
) )
# if files is a single directory, use that as config_dir= # if files is a single directory, use that as config_dir=
@ -591,3 +598,15 @@ async def check_databases(ds):
raise click.UsageError( raise click.UsageError(
f"Connection to {database.path} failed check: {str(e.args[0])}" f"Connection to {database.path} failed check: {str(e.args[0])}"
) )
# If --crossdb and more than SQLITE_LIMIT_ATTACHED show warning
if (
ds.crossdb
and len([db for db in ds.databases.values() if not db.is_memory])
> SQLITE_LIMIT_ATTACHED
):
msg = (
"Warning: --crossdb only works with the first {} attached databases".format(
SQLITE_LIMIT_ATTACHED
)
)
click.echo(click.style(msg, bold=True, fg="yellow"), err=True)

Wyświetl plik

@ -1,4 +1,5 @@
import asyncio import asyncio
from collections import namedtuple
from pathlib import Path from pathlib import Path
import janus import janus
import queue import queue
@ -22,6 +23,8 @@ from .inspect import inspect_hash
connections = threading.local() connections = threading.local()
AttachedDatabase = namedtuple("AttachedDatabase", ("seq", "name", "file"))
class Database: class Database:
def __init__( def __init__(
@ -78,7 +81,7 @@ class Database:
conn.execute("PRAGMA query_only=1") conn.execute("PRAGMA query_only=1")
return conn return conn
if self.is_memory: if self.is_memory:
return sqlite3.connect(":memory:") return sqlite3.connect(":memory:", uri=True)
# mode=ro or immutable=1? # mode=ro or immutable=1?
if self.is_mutable: if self.is_mutable:
qs = "?mode=ro" qs = "?mode=ro"
@ -243,6 +246,12 @@ class Database:
return None return None
return Path(self.path).stat().st_mtime_ns return Path(self.path).stat().st_mtime_ns
async def attached_databases(self):
    """Return the databases attached to this connection.

    Each entry is an ``AttachedDatabase`` named tuple built from the
    ``seq``, ``name`` and ``file`` columns of ``pragma_database_list()``.
    """
    # The query drops rows with seq 0 - presumably the primary database
    # itself; confirm against the SQLite PRAGMA documentation.
    query = "select seq, name, file from pragma_database_list() where seq > 0"
    listing = await self.execute(query)
    return [AttachedDatabase(*record) for record in listing.rows]
async def table_exists(self, table): async def table_exists(self, table):
results = await self.execute( results = await self.execute(
"select 1 from sqlite_master where type='table' and name=?", params=(table,) "select 1 from sqlite_master where type='table' and name=?", params=(table,)

Wyświetl plik

@ -56,6 +56,17 @@
</form> </form>
{% endif %} {% endif %}
{% if attached_databases %}
<div class="message-info">
<p>The following databases are attached to this connection, and can be used for cross-database joins:</p>
<ul class="bullets">
{% for db_name in attached_databases %}
<li><strong>{{ db_name }}</strong> - <a href="?sql=select+*+from+[{{ db_name }}].sqlite_master+where+type='table'">tables</a></li>
{% endfor %}
</ul>
</div>
{% endif %}
{% for table in tables %} {% for table in tables %}
{% if show_hidden or not table.hidden %} {% if show_hidden or not table.hidden %}
<div class="db-table"> <div class="db-table">

Wyświetl plik

@ -115,6 +115,8 @@ class DatabaseView(DataView):
links.extend(extra_links) links.extend(extra_links)
return links return links
attached_databases = [d.name for d in await db.attached_databases()]
return ( return (
{ {
"database": database, "database": database,
@ -139,6 +141,7 @@ class DatabaseView(DataView):
"allow_download": self.ds.setting("allow_download") "allow_download": self.ds.setting("allow_download")
and not db.is_mutable and not db.is_mutable
and not db.is_memory, and not db.is_memory,
"attached_databases": attached_databases,
}, },
(f"database-{to_css_class(database)}.html", "database.html"), (f"database-{to_css_class(database)}.html", "database.html"),
) )

Wyświetl plik

@ -41,6 +41,7 @@ Options:
--pdb Launch debugger on any errors --pdb Launch debugger on any errors
-o, --open Open Datasette in your web browser -o, --open Open Datasette in your web browser
--create Create database files if they do not exist --create Create database files if they do not exist
--crossdb Enable cross-database joins using the /_memory database
--ssl-keyfile TEXT SSL key file --ssl-keyfile TEXT SSL key file
--ssl-certfile TEXT SSL certificate file --ssl-certfile TEXT SSL certificate file
--help Show this message and exit. --help Show this message and exit.

Wyświetl plik

@ -677,6 +677,9 @@ The ``Database`` class also provides properties and methods for introspecting th
``db.is_memory`` - boolean ``db.is_memory`` - boolean
Is this database an in-memory database? Is this database an in-memory database?
``await db.attached_databases()`` - list of named tuples
Returns a list of additional databases that have been connected to this database using the SQLite ATTACH command. Each named tuple has fields ``seq``, ``name`` and ``file``.
``await db.table_exists(table)`` - boolean ``await db.table_exists(table)`` - boolean
Check if a table called ``table`` exists. Check if a table called ``table`` exists.

Wyświetl plik

@ -389,3 +389,34 @@ detect if there should be another page.
Since the where clause acts against the index on the primary key, the query is Since the where clause acts against the index on the primary key, the query is
extremely fast even for records that are a long way into the overall pagination extremely fast even for records that are a long way into the overall pagination
set. set.
.. _cross_database_queries:
Cross-database queries
----------------------
SQLite has the ability to run queries that join across multiple databases. Up to ten databases can be attached to a single SQLite connection and queried together.
Datasette can execute joins across multiple databases if it is started with the ``--crossdb`` option::
datasette fixtures.db extra_database.db --crossdb
If it is started in this way, the ``/_memory`` page can be used to execute queries that join across multiple databases.
References to tables in attached databases should be preceded by the database name and a period.
For example, this query will show a list of tables across both of the above databases:
.. code-block:: sql
select
'fixtures' as database, *
from
[fixtures].sqlite_master
union
select
'extra_database' as database, *
from
[extra_database].sqlite_master
`Try that out here <https://latest.datasette.io/_memory?sql=select%0D%0A++%27fixtures%27+as+database%2C+*%0D%0Afrom%0D%0A++%5Bfixtures%5D.sqlite_master%0D%0Aunion%0D%0Aselect%0D%0A++%27extra_database%27+as+database%2C+*%0D%0Afrom%0D%0A++%5Bextra_database%5D.sqlite_master>`__.

Wyświetl plik

@ -105,6 +105,7 @@ def make_app_client(
static_mounts=None, static_mounts=None,
template_dir=None, template_dir=None,
metadata=None, metadata=None,
crossdb=False,
): ):
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
filepath = os.path.join(tmpdir, filename) filepath = os.path.join(tmpdir, filename)
@ -149,6 +150,7 @@ def make_app_client(
inspect_data=inspect_data, inspect_data=inspect_data,
static_mounts=static_mounts, static_mounts=static_mounts,
template_dir=template_dir, template_dir=template_dir,
crossdb=crossdb,
) )
ds.sqlite_functions.append(("sleep", 1, lambda n: time.sleep(float(n)))) ds.sqlite_functions.append(("sleep", 1, lambda n: time.sleep(float(n))))
yield TestClient(ds) yield TestClient(ds)
@ -180,6 +182,15 @@ def app_client_two_attached_databases():
yield client yield client
@pytest.fixture(scope="session")
def app_client_two_attached_databases_crossdb_enabled():
    """Yield a session-scoped test client with an extra database and crossdb=True."""
    extras = {"extra database.db": EXTRA_DATABASE_SQL}
    with make_app_client(extra_databases=extras, crossdb=True) as crossdb_client:
        yield crossdb_client
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def app_client_conflicting_database_names(): def app_client_conflicting_database_names():
with make_app_client( with make_app_client(
@ -750,7 +761,12 @@ def assert_permissions_checked(datasette, actions):
default=False, default=False,
help="Delete and recreate database if it exists", help="Delete and recreate database if it exists",
) )
def cli(db_filename, metadata, plugins_path, recreate): @click.option(
"--extra-db-filename",
type=click.Path(file_okay=True, dir_okay=False),
help="Write out second test DB to this file",
)
def cli(db_filename, metadata, plugins_path, recreate, extra_db_filename):
"""Write out the fixtures database used by Datasette's test suite""" """Write out the fixtures database used by Datasette's test suite"""
if metadata and not metadata.endswith(".json"): if metadata and not metadata.endswith(".json"):
raise click.ClickException("Metadata should end with .json") raise click.ClickException("Metadata should end with .json")
@ -784,6 +800,17 @@ def cli(db_filename, metadata, plugins_path, recreate):
newpath = path / filepath.name newpath = path / filepath.name
newpath.write_text(filepath.open().read()) newpath.write_text(filepath.open().read())
print(f" Wrote plugin: {newpath}") print(f" Wrote plugin: {newpath}")
if extra_db_filename:
if pathlib.Path(extra_db_filename).exists():
if not recreate:
raise click.ClickException(
f"{extra_db_filename} already exists, use --recreate to reset it"
)
else:
pathlib.Path(extra_db_filename).unlink()
conn = sqlite3.connect(extra_db_filename)
conn.executescript(EXTRA_DATABASE_SQL)
print(f"Test tables written to {extra_db_filename}")
if __name__ == "__main__": if __name__ == "__main__":

Wyświetl plik

@ -147,6 +147,7 @@ def test_metadata_yaml():
get=None, get=None,
help_config=False, help_config=False,
pdb=False, pdb=False,
crossdb=False,
open_browser=False, open_browser=False,
create=False, create=False,
ssl_keyfile=None, ssl_keyfile=None,

Wyświetl plik

@ -0,0 +1,75 @@
from datasette.cli import cli
from click.testing import CliRunner
import urllib
import sqlite3
from .fixtures import app_client_two_attached_databases_crossdb_enabled
def test_crossdb_join(app_client_two_attached_databases_crossdb_enabled):
    """Query /_memory.json with a UNION ALL spanning both attached databases.

    The response must be a 200 whose JSON array contains the ``searchable``
    rows from ``extra database`` followed by those from ``fixtures``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded, so the original only
    # worked when some other module had already imported it.
    from urllib.parse import urlencode

    app_client = app_client_two_attached_databases_crossdb_enabled
    sql = """
select
'extra database' as db,
pk,
text1,
text2
from
[extra database].searchable
union all
select
'fixtures' as db,
pk,
text1,
text2
from
fixtures.searchable
"""
    response = app_client.get(
        "/_memory.json?" + urlencode({"sql": sql, "_shape": "array"})
    )
    assert response.status == 200
    # Expected order follows the order of the two SELECTs in the UNION ALL.
    assert response.json == [
        {"db": "extra database", "pk": 1, "text1": "barry cat", "text2": "terry dog"},
        {"db": "extra database", "pk": 2, "text1": "terry dog", "text2": "sara weasel"},
        {"db": "fixtures", "pk": 1, "text1": "barry cat", "text2": "terry dog"},
        {"db": "fixtures", "pk": 2, "text1": "terry dog", "text2": "sara weasel"},
    ]
def test_crossdb_warning_if_too_many_databases(tmp_path_factory):
    """Serving 11 databases with --crossdb must print the limit warning to stderr."""
    db_dir = tmp_path_factory.mktemp("dbs")
    dbs = []
    # 11 files: one more than the limit of 10 named in the expected warning.
    for i in range(11):
        path = str(db_dir / "db_{}.db".format(i))
        conn = sqlite3.connect(path)
        try:
            # Presumably run so that a database file is actually written to disk.
            conn.execute("vacuum")
        finally:
            # Close explicitly so the test does not leak open connections
            # (and file handles) for the rest of the session.
            conn.close()
        dbs.append(path)
    # mix_stderr=False keeps stderr separate so result.stderr can be inspected.
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "serve",
            "--crossdb",
            "--get",
            "/",
        ]
        + dbs,
        catch_exceptions=False,
    )
    assert (
        "Warning: --crossdb only works with the first 10 attached databases"
        in result.stderr
    )
def test_crossdb_attached_database_list_display(
    app_client_two_attached_databases_crossdb_enabled,
):
    """The /_memory page must list both attached databases by name."""
    page = app_client_two_attached_databases_crossdb_enabled.get("/_memory")
    html = page.text
    expected_fragments = [
        "databases are attached to this connection",
        "<li><strong>fixtures</strong> - ",
        "<li><strong>extra database</strong> - ",
    ]
    for expected in expected_fragments:
        assert expected in html

Wyświetl plik

@ -4,7 +4,7 @@ Tests for the datasette.database.Database class
from datasette.database import Database, Results, MultipleValues from datasette.database import Database, Results, MultipleValues
from datasette.utils.sqlite import sqlite3, supports_generated_columns from datasette.utils.sqlite import sqlite3, supports_generated_columns
from datasette.utils import Column from datasette.utils import Column
from .fixtures import app_client from .fixtures import app_client, app_client_two_attached_databases_crossdb_enabled
import pytest import pytest
import time import time
import uuid import uuid
@ -466,6 +466,15 @@ def test_is_mutable(app_client):
assert Database(app_client.ds, is_memory=True, is_mutable=False).is_mutable is False assert Database(app_client.ds, is_memory=True, is_mutable=False).is_mutable is False
@pytest.mark.asyncio
async def test_attached_databases(app_client_two_attached_databases_crossdb_enabled):
    """The _memory database must report both file databases as attached."""
    datasette = app_client_two_attached_databases_crossdb_enabled.ds
    memory_db = datasette.get_database("_memory")
    names = {entry.name for entry in await memory_db.attached_databases()}
    assert names == {"extra database", "fixtures"}
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_database_memory_name(app_client): async def test_database_memory_name(app_client):
ds = app_client.ds ds = app_client.ds