--crossdb option for joining across databases (#1232)

* Test for cross-database join, refs #283
* Warn if --crossdb used with more than 10 DBs, refs #283
* latest.datasette.io demo of --crossdb joins, refs #283
* Show attached databases on /_memory page, refs #283
* Documentation for cross-database queries, refs #283
pull/1243/head
Simon Willison 2021-02-18 14:09:12 -08:00 zatwierdzone przez GitHub
rodzic 4df548e766
commit 6f41c8a2be
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
13 zmienionych plików z 215 dodań i 8 usunięć

Wyświetl plik

@ -31,7 +31,7 @@ jobs:
- name: Run tests
run: pytest
- name: Build fixtures.db
run: python tests/fixtures.py fixtures.db fixtures.json plugins
run: python tests/fixtures.py fixtures.db fixtures.json plugins --extra-db-filename extra_database.db
- name: Build docs.db
run: |-
cd docs
@ -48,12 +48,12 @@ jobs:
run: |-
gcloud config set run/region us-central1
gcloud config set project datasette-222320
datasette publish cloudrun fixtures.db \
datasette publish cloudrun fixtures.db extra_database.db \
-m fixtures.json \
--plugins-dir=plugins \
--branch=$GITHUB_SHA \
--version-note=$GITHUB_SHA \
--extra-options="--setting template_debug 1" \
--extra-options="--setting template_debug 1 --crossdb" \
--install=pysqlite3-binary \
--service=datasette-latest
# Deploy docs.db to a different service

Wyświetl plik

@ -85,6 +85,9 @@ from .version import __version__
app_root = Path(__file__).parent.parent
# https://github.com/simonw/datasette/issues/283#issuecomment-781591015
SQLITE_LIMIT_ATTACHED = 10
Setting = collections.namedtuple("Setting", ("name", "default", "help"))
SETTINGS = (
Setting("default_page_size", 100, "Default page size for the table view"),
@ -194,6 +197,7 @@ class Datasette:
version_note=None,
config_dir=None,
pdb=False,
crossdb=False,
):
assert config_dir is None or isinstance(
config_dir, Path
@ -217,7 +221,8 @@ class Datasette:
self.inspect_data = inspect_data
self.immutables = set(immutables or [])
self.databases = collections.OrderedDict()
if memory or not self.files:
self.crossdb = crossdb
if memory or crossdb or not self.files:
self.add_database(Database(self, is_memory=True), name="_memory")
# memory_name is a random string so that each Datasette instance gets its own
# unique in-memory named database - otherwise unit tests can fail with weird
@ -499,6 +504,19 @@ class Datasette:
conn.execute(f"PRAGMA cache_size=-{self.setting('cache_size_kb')}")
# pylint: disable=no-member
pm.hook.prepare_connection(conn=conn, database=database, datasette=self)
# If self.crossdb and this is _memory, connect the first SQLITE_LIMIT_ATTACHED databases
if self.crossdb and database == "_memory":
count = 0
for db_name, db in self.databases.items():
if count >= SQLITE_LIMIT_ATTACHED or db.is_memory:
continue
sql = 'ATTACH DATABASE "file:{path}?{qs}" AS [{name}];'.format(
path=db.path,
qs="mode=ro" if db.is_mutable else "immutable=1",
name=db_name,
)
conn.execute(sql)
count += 1
def add_message(self, request, message, type=INFO):
if not hasattr(request, "_messages"):

Wyświetl plik

@ -12,7 +12,7 @@ from subprocess import call
import sys
from runpy import run_module
import webbrowser
from .app import Datasette, DEFAULT_SETTINGS, SETTINGS, pm
from .app import Datasette, DEFAULT_SETTINGS, SETTINGS, SQLITE_LIMIT_ATTACHED, pm
from .utils import (
StartupError,
check_connection,
@ -410,6 +410,11 @@ def uninstall(packages, yes):
is_flag=True,
help="Create database files if they do not exist",
)
@click.option(
"--crossdb",
is_flag=True,
help="Enable cross-database joins using the /_memory database",
)
@click.option(
"--ssl-keyfile",
help="SSL key file",
@ -442,6 +447,7 @@ def serve(
pdb,
open_browser,
create,
crossdb,
ssl_keyfile,
ssl_certfile,
return_instance=False,
@ -499,6 +505,7 @@ def serve(
secret=secret,
version_note=version_note,
pdb=pdb,
crossdb=crossdb,
)
# if files is a single directory, use that as config_dir=
@ -591,3 +598,15 @@ async def check_databases(ds):
raise click.UsageError(
f"Connection to {database.path} failed check: {str(e.args[0])}"
)
# If --crossdb and more than SQLITE_LIMIT_ATTACHED show warning
if (
ds.crossdb
and len([db for db in ds.databases.values() if not db.is_memory])
> SQLITE_LIMIT_ATTACHED
):
msg = (
"Warning: --crossdb only works with the first {} attached databases".format(
SQLITE_LIMIT_ATTACHED
)
)
click.echo(click.style(msg, bold=True, fg="yellow"), err=True)

Wyświetl plik

@ -1,4 +1,5 @@
import asyncio
from collections import namedtuple
from pathlib import Path
import janus
import queue
@ -22,6 +23,8 @@ from .inspect import inspect_hash
connections = threading.local()
AttachedDatabase = namedtuple("AttachedDatabase", ("seq", "name", "file"))
class Database:
def __init__(
@ -78,7 +81,7 @@ class Database:
conn.execute("PRAGMA query_only=1")
return conn
if self.is_memory:
return sqlite3.connect(":memory:")
return sqlite3.connect(":memory:", uri=True)
# mode=ro or immutable=1?
if self.is_mutable:
qs = "?mode=ro"
@ -243,6 +246,12 @@ class Database:
return None
return Path(self.path).stat().st_mtime_ns
async def attached_databases(self):
    """Return the databases attached to this database's connection.

    Queries ``pragma_database_list()`` for rows with ``seq > 0`` and wraps
    each result row in an ``AttachedDatabase`` named tuple, whose fields are
    ``seq``, ``name`` and ``file``.
    """
    listing = await self.execute(
        "select seq, name, file from pragma_database_list() where seq > 0"
    )
    attached = []
    for record in listing.rows:
        attached.append(AttachedDatabase(*record))
    return attached
async def table_exists(self, table):
results = await self.execute(
"select 1 from sqlite_master where type='table' and name=?", params=(table,)

Wyświetl plik

@ -56,6 +56,17 @@
</form>
{% endif %}
{% if attached_databases %}
<div class="message-info">
<p>The following databases are attached to this connection, and can be used for cross-database joins:</p>
<ul class="bullets">
{% for db_name in attached_databases %}
<li><strong>{{ db_name }}</strong> - <a href="?sql=select+*+from+[{{ db_name }}].sqlite_master+where+type='table'">tables</a></li>
{% endfor %}
</ul>
</div>
{% endif %}
{% for table in tables %}
{% if show_hidden or not table.hidden %}
<div class="db-table">

Wyświetl plik

@ -115,6 +115,8 @@ class DatabaseView(DataView):
links.extend(extra_links)
return links
attached_databases = [d.name for d in await db.attached_databases()]
return (
{
"database": database,
@ -139,6 +141,7 @@ class DatabaseView(DataView):
"allow_download": self.ds.setting("allow_download")
and not db.is_mutable
and not db.is_memory,
"attached_databases": attached_databases,
},
(f"database-{to_css_class(database)}.html", "database.html"),
)

Wyświetl plik

@ -41,6 +41,7 @@ Options:
--pdb Launch debugger on any errors
-o, --open Open Datasette in your web browser
--create Create database files if they do not exist
--crossdb Enable cross-database joins using the /_memory database
--ssl-keyfile TEXT SSL key file
--ssl-certfile TEXT SSL certificate file
--help Show this message and exit.

Wyświetl plik

@ -677,6 +677,9 @@ The ``Database`` class also provides properties and methods for introspecting th
``db.is_memory`` - boolean
Is this database an in-memory database?
``await db.attached_databases()`` - list of named tuples
Returns a list of additional databases that have been connected to this database using the SQLite ATTACH command. Each named tuple has fields ``seq``, ``name`` and ``file``.
``await db.table_exists(table)`` - boolean
Check if a table called ``table`` exists.

Wyświetl plik

@ -389,3 +389,34 @@ detect if there should be another page.
Since the where clause acts against the index on the primary key, the query is
extremely fast even for records that are a long way into the overall pagination
set.
.. _cross_database_quereies:
Cross-database queries
----------------------
SQLite has the ability to run queries that join across multiple databases. Up to ten databases can be attached to a single SQLite connection and queried together.
Datasette can execute joins across multiple databases if it is started with the ``--crossdb`` option::
datasette fixtures.db extra_database.db --crossdb
If it is started in this way, the ``/_memory`` page can be used to execute queries that join across multiple databases.
References to tables in attached databases should be preceded by the database name and a period.
For example, this query will show a list of tables across both of the above databases:
.. code-block:: sql
select
'fixtures' as database, *
from
[fixtures].sqlite_master
union
select
'extra_database' as database, *
from
[extra_database].sqlite_master
`Try that out here <https://latest.datasette.io/_memory?sql=select%0D%0A++%27fixtures%27+as+database%2C+*%0D%0Afrom%0D%0A++%5Bfixtures%5D.sqlite_master%0D%0Aunion%0D%0Aselect%0D%0A++%27extra_database%27+as+database%2C+*%0D%0Afrom%0D%0A++%5Bextra_database%5D.sqlite_master>`__.

Wyświetl plik

@ -105,6 +105,7 @@ def make_app_client(
static_mounts=None,
template_dir=None,
metadata=None,
crossdb=False,
):
with tempfile.TemporaryDirectory() as tmpdir:
filepath = os.path.join(tmpdir, filename)
@ -149,6 +150,7 @@ def make_app_client(
inspect_data=inspect_data,
static_mounts=static_mounts,
template_dir=template_dir,
crossdb=crossdb,
)
ds.sqlite_functions.append(("sleep", 1, lambda n: time.sleep(float(n))))
yield TestClient(ds)
@ -180,6 +182,15 @@ def app_client_two_attached_databases():
yield client
@pytest.fixture(scope="session")
def app_client_two_attached_databases_crossdb_enabled():
    """Session-scoped client with one extra database and ``crossdb`` enabled."""
    extra = {"extra database.db": EXTRA_DATABASE_SQL}
    with make_app_client(extra_databases=extra, crossdb=True) as crossdb_client:
        yield crossdb_client
@pytest.fixture(scope="session")
def app_client_conflicting_database_names():
with make_app_client(
@ -750,7 +761,12 @@ def assert_permissions_checked(datasette, actions):
default=False,
help="Delete and recreate database if it exists",
)
def cli(db_filename, metadata, plugins_path, recreate):
@click.option(
"--extra-db-filename",
type=click.Path(file_okay=True, dir_okay=False),
help="Write out second test DB to this file",
)
def cli(db_filename, metadata, plugins_path, recreate, extra_db_filename):
"""Write out the fixtures database used by Datasette's test suite"""
if metadata and not metadata.endswith(".json"):
raise click.ClickException("Metadata should end with .json")
@ -784,6 +800,17 @@ def cli(db_filename, metadata, plugins_path, recreate):
newpath = path / filepath.name
newpath.write_text(filepath.open().read())
print(f" Wrote plugin: {newpath}")
if extra_db_filename:
if pathlib.Path(extra_db_filename).exists():
if not recreate:
raise click.ClickException(
f"{extra_db_filename} already exists, use --recreate to reset it"
)
else:
pathlib.Path(extra_db_filename).unlink()
conn = sqlite3.connect(extra_db_filename)
conn.executescript(EXTRA_DATABASE_SQL)
print(f"Test tables written to {extra_db_filename}")
if __name__ == "__main__":

Wyświetl plik

@ -147,6 +147,7 @@ def test_metadata_yaml():
get=None,
help_config=False,
pdb=False,
crossdb=False,
open_browser=False,
create=False,
ssl_keyfile=None,

Wyświetl plik

@ -0,0 +1,75 @@
from datasette.cli import cli
from click.testing import CliRunner
import urllib
import sqlite3
from .fixtures import app_client_two_attached_databases_crossdb_enabled
def test_crossdb_join(app_client_two_attached_databases_crossdb_enabled):
    """A query sent to /_memory can union rows from both attached databases.

    Requests the SQL below as ``_shape=array`` JSON and checks that the
    ``searchable`` rows of "extra database" and "fixtures" are both returned.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded, so relying on it only
    # works when some other module happens to have imported it first.
    import urllib.parse

    app_client = app_client_two_attached_databases_crossdb_enabled
    sql = """
select
'extra database' as db,
pk,
text1,
text2
from
[extra database].searchable
union all
select
'fixtures' as db,
pk,
text1,
text2
from
fixtures.searchable
"""
    response = app_client.get(
        "/_memory.json?" + urllib.parse.urlencode({"sql": sql, "_shape": "array"})
    )
    assert response.status == 200
    assert response.json == [
        {"db": "extra database", "pk": 1, "text1": "barry cat", "text2": "terry dog"},
        {"db": "extra database", "pk": 2, "text1": "terry dog", "text2": "sara weasel"},
        {"db": "fixtures", "pk": 1, "text1": "barry cat", "text2": "terry dog"},
        {"db": "fixtures", "pk": 2, "text1": "terry dog", "text2": "sara weasel"},
    ]
def test_crossdb_warning_if_too_many_databases(tmp_path_factory):
    """``serve --crossdb`` with eleven database files prints a warning on stderr.

    Creates eleven SQLite files in a temporary directory, invokes the CLI with
    ``--crossdb --get /`` and checks the warning text appears in stderr.
    """
    db_dir = tmp_path_factory.mktemp("dbs")
    dbs = []
    for i in range(11):
        path = str(db_dir / "db_{}.db".format(i))
        conn = sqlite3.connect(path)
        try:
            # NOTE(review): presumably run so the database file is actually
            # written to disk before it is passed to the CLI - confirm.
            conn.execute("vacuum")
        finally:
            # Close explicitly so no open handles to the temp files are leaked.
            conn.close()
        dbs.append(path)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "serve",
            "--crossdb",
            "--get",
            "/",
        ]
        + dbs,
        catch_exceptions=False,
    )
    assert (
        "Warning: --crossdb only works with the first 10 attached databases"
        in result.stderr
    )
def test_crossdb_attached_database_list_display(
    app_client_two_attached_databases_crossdb_enabled,
):
    """The /_memory page mentions the attached databases and lists each by name."""
    page = app_client_two_attached_databases_crossdb_enabled.get("/_memory")
    expected_fragments = [
        "databases are attached to this connection",
        "<li><strong>fixtures</strong> - ",
        "<li><strong>extra database</strong> - ",
    ]
    for expected in expected_fragments:
        assert expected in page.text

Wyświetl plik

@ -4,7 +4,7 @@ Tests for the datasette.database.Database class
from datasette.database import Database, Results, MultipleValues
from datasette.utils.sqlite import sqlite3, supports_generated_columns
from datasette.utils import Column
from .fixtures import app_client
from .fixtures import app_client, app_client_two_attached_databases_crossdb_enabled
import pytest
import time
import uuid
@ -466,6 +466,15 @@ def test_is_mutable(app_client):
assert Database(app_client.ds, is_memory=True, is_mutable=False).is_mutable is False
@pytest.mark.asyncio
async def test_attached_databases(app_client_two_attached_databases_crossdb_enabled):
    """``attached_databases()`` on ``_memory`` reports both attached databases."""
    ds = app_client_two_attached_databases_crossdb_enabled.ds
    memory_db = ds.get_database("_memory")
    names = set()
    for entry in await memory_db.attached_databases():
        names.add(entry.name)
    assert names == {"extra database", "fixtures"}
@pytest.mark.asyncio
async def test_database_memory_name(app_client):
ds = app_client.ds