Reworked metadata building options

Building metadata is now optional. If you want to build it ahead of time, do this:

    datasette build_metadata *.db --metadata=metadata.json

Then, when you run the server, you can tell it to read from that metadata file:

    datasette serve *.db --metadata=metadata.json

The Dockerfile generated by datasette publish now uses this mechanism.
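
For reference, the metadata file maps each database name (the file stem) to its content hash, filename and per-table row counts. A sketch of the generated structure, with illustrative names and a truncated hash:

    {
        "mydb": {
            "hash": "a1b2c3d4...(sha256 of the file contents)",
            "file": "mydb.db",
            "tables": {
                "example_table": 1234
            }
        }
    }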

Closes #60
pull/81/head
Simon Willison 2017-11-11 12:10:51 -08:00
parent ad8b5d3bd2
commit 40a563ebac
4 changed files with 98 additions and 89 deletions

datasette/app.py

@@ -27,56 +27,17 @@ from .utils import (
 app_root = Path(__file__).parent.parent
-BUILD_METADATA = 'build-metadata.json'
 HASH_BLOCK_SIZE = 1024 * 1024
 SQL_TIME_LIMIT_MS = 1000
 
 connections = threading.local()
 
-
-def ensure_build_metadata(files, regenerate=True):
-    build_metadata = app_root / BUILD_METADATA
-    if build_metadata.exists() and not regenerate:
-        return json.loads(build_metadata.read_text())
-    print('Building metadata... path={}'.format(build_metadata))
-    metadata = {}
-    for filename in files:
-        path = Path(filename)
-        name = path.stem
-        if name in metadata:
-            raise Exception('Multiple files with same stem %s' % name)
-        # Calculate hash, efficiently
-        m = hashlib.sha256()
-        with path.open('rb') as fp:
-            while True:
-                data = fp.read(HASH_BLOCK_SIZE)
-                if not data:
-                    break
-                m.update(data)
-        # List tables and their row counts
-        tables = {}
-        with sqlite3.connect('file:{}?immutable=1'.format(path.name), uri=True) as conn:
-            conn.row_factory = sqlite3.Row
-            table_names = [
-                r['name']
-                for r in conn.execute('select * from sqlite_master where type="table"')
-            ]
-            for table in table_names:
-                tables[table] = conn.execute('select count(*) from "{}"'.format(table)).fetchone()[0]
-        metadata[name] = {
-            'hash': m.hexdigest(),
-            'file': path.name,
-            'tables': tables,
-        }
-    build_metadata.write_text(json.dumps(metadata, indent=4))
-    return metadata
-
 
 class BaseView(HTTPMethodView):
     template = None
 
     def __init__(self, datasette):
+        self.ds = datasette
         self.files = datasette.files
         self.jinja = datasette.jinja
         self.executor = datasette.executor

@@ -103,12 +64,45 @@ class BaseView(HTTPMethodView):
         rows.sort(key=lambda row: row[-1])
         return [str(r[1]) for r in rows]
 
+    def resolve_db_name(self, db_name, **kwargs):
+        databases = self.ds.metadata()
+        hash = None
+        name = None
+        if '-' in db_name:
+            # Might be name-and-hash, or might just be
+            # a name with a hyphen in it
+            name, hash = db_name.rsplit('-', 1)
+            if name not in databases:
+                # Try the whole name
+                name = db_name
+                hash = None
+        else:
+            name = db_name
+        # Verify the hash
+        try:
+            info = databases[name]
+        except KeyError:
+            raise NotFound('Database not found: {}'.format(name))
+        expected = info['hash'][:7]
+        if expected != hash:
+            should_redirect = '/{}-{}'.format(
+                name, expected,
+            )
+            if 'table' in kwargs:
+                should_redirect += '/' + kwargs['table']
+            if 'as_json' in kwargs:
+                should_redirect += kwargs['as_json']
+            if 'as_db' in kwargs:
+                should_redirect += kwargs['as_db']
+            return name, expected, should_redirect
+        return name, expected, None
+
     async def execute(self, db_name, sql, params=None):
         """Executes sql against db_name in a thread"""
         def sql_operation_in_thread():
             conn = getattr(connections, db_name, None)
             if not conn:
-                info = ensure_build_metadata(self.files)[db_name]
+                info = self.ds.metadata()[db_name]
                 conn = sqlite3.connect(
                     'file:{}?immutable=1'.format(info['file']),
                     uri=True,

@@ -133,7 +127,7 @@ class BaseView(HTTPMethodView):
         )
 
     async def get(self, request, db_name, **kwargs):
-        name, hash, should_redirect = resolve_db_name(self.files, db_name, **kwargs)
+        name, hash, should_redirect = self.resolve_db_name(db_name, **kwargs)
         if should_redirect:
             return self.redirect(request, should_redirect)
         return await self.view_get(request, name, hash, **kwargs)

@@ -196,13 +190,14 @@ class BaseView(HTTPMethodView):
 
 class IndexView(HTTPMethodView):
     def __init__(self, datasette):
+        self.ds = datasette
         self.files = datasette.files
         self.jinja = datasette.jinja
         self.executor = datasette.executor
 
     async def get(self, request, as_json):
         databases = []
-        for key, info in sorted(ensure_build_metadata(self.files).items()):
+        for key, info in sorted(self.ds.metadata().items()):
             database = {
                 'name': key,
                 'hash': info['hash'],

@@ -263,7 +258,7 @@ class DatabaseView(BaseView):
 
 class DatabaseDownload(BaseView):
     async def view_get(self, request, name, hash, **kwargs):
-        filepath = ensure_build_metadata(self.files)[name]['file']
+        filepath = self.ds.metadata()[name]['file']
         return await response.file_stream(
             filepath, headers={
                 'Content-Disposition': 'attachment; filename="{}"'.format(filepath)

@@ -339,7 +334,7 @@ class TableView(BaseView):
         if use_rowid:
             display_columns = display_columns[1:]
         rows = list(rows)
-        info = ensure_build_metadata(self.files)
+        info = self.ds.metadata()
         total_rows = info[name]['tables'].get(table)
         after = None
         after_link = None

@@ -404,42 +399,8 @@ class RowView(BaseView):
         }
 
 
-def resolve_db_name(files, db_name, **kwargs):
-    databases = ensure_build_metadata(files)
-    hash = None
-    name = None
-    if '-' in db_name:
-        # Might be name-and-hash, or might just be
-        # a name with a hyphen in it
-        name, hash = db_name.rsplit('-', 1)
-        if name not in databases:
-            # Try the whole name
-            name = db_name
-            hash = None
-    else:
-        name = db_name
-    # Verify the hash
-    try:
-        info = databases[name]
-    except KeyError:
-        raise NotFound('Database not found: {}'.format(name))
-    expected = info['hash'][:7]
-    if expected != hash:
-        should_redirect = '/{}-{}'.format(
-            name, expected,
-        )
-        if 'table' in kwargs:
-            should_redirect += '/' + kwargs['table']
-        if 'as_json' in kwargs:
-            should_redirect += kwargs['as_json']
-        if 'as_db' in kwargs:
-            should_redirect += kwargs['as_db']
-        return name, expected, should_redirect
-    return name, expected, None
-
-
 class Datasette:
-    def __init__(self, files, num_threads=3, cache_headers=True, page_size=50):
+    def __init__(self, files, num_threads=3, cache_headers=True, page_size=50, metadata=None):
         self.files = files
         self.num_threads = num_threads
         self.executor = futures.ThreadPoolExecutor(

@@ -447,6 +408,43 @@ class Datasette:
         )
         self.cache_headers = cache_headers
         self.page_size = page_size
+        self._metadata = metadata
+
+    def metadata(self):
+        if self._metadata:
+            return self._metadata
+        metadata = {}
+        for filename in self.files:
+            path = Path(filename)
+            name = path.stem
+            if name in metadata:
+                raise Exception('Multiple files with same stem %s' % name)
+            # Calculate hash, efficiently
+            m = hashlib.sha256()
+            with path.open('rb') as fp:
+                while True:
+                    data = fp.read(HASH_BLOCK_SIZE)
+                    if not data:
+                        break
+                    m.update(data)
+            # List tables and their row counts
+            tables = {}
+            with sqlite3.connect('file:{}?immutable=1'.format(path.name), uri=True) as conn:
+                conn.row_factory = sqlite3.Row
+                table_names = [
+                    r['name']
+                    for r in conn.execute('select * from sqlite_master where type="table"')
+                ]
+                for table in table_names:
+                    tables[table] = conn.execute('select count(*) from "{}"'.format(table)).fetchone()[0]
+            metadata[name] = {
+                'hash': m.hexdigest(),
+                'file': path.name,
+                'tables': tables,
+            }
+        self._metadata = metadata
+        return metadata
 
     def app(self):
         app = Sanic(__name__)
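
Taken together, the app.py changes mean metadata is either supplied up front or computed lazily and cached on the Datasette instance. A minimal sketch of the new flow (database filename hypothetical):

    from datasette.app import Datasette

    app = Datasette(['mydb.db'])
    meta = app.metadata()   # first call hashes the file and counts rows per table
    meta = app.metadata()   # subsequent calls return the cached dict

    # Or skip the expensive work entirely by passing prebuilt metadata:
    app = Datasette(['mydb.db'], metadata={
        'mydb': {'hash': '...', 'file': 'mydb.db', 'tables': {}},
    })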

datasette/cli.py

@@ -1,11 +1,12 @@
 import click
 from click_default_group import DefaultGroup
+import json
 import os
 import shutil
 from subprocess import call
 import sys
 import tempfile
 
-from .app import Datasette, ensure_build_metadata
+from .app import Datasette
 from .utils import make_dockerfile

@@ -18,8 +19,10 @@ def cli():
 
 @cli.command()
 @click.argument('files', type=click.Path(exists=True), nargs=-1)
-def build(files):
-    ensure_build_metadata(files, True)
+@click.option('-m', '--metadata', default='metadata.json')
+def build_metadata(files, metadata):
+    app = Datasette(files)
+    open(metadata, 'w').write(json.dumps(app.metadata(), indent=2))
 
 
 @cli.command()

@@ -62,12 +65,20 @@ def publish(files):
 @click.option('-p', '--port', default=8001)
 @click.option('--debug', is_flag=True)
 @click.option('--reload', is_flag=True)
-def serve(files, host, port, debug, reload):
+@click.option('-m', '--metadata')
+def serve(files, host, port, debug, reload, metadata):
     """Serve up specified database files with a web UI"""
     if reload:
         import hupper
         hupper.start_reloader('datasette.cli.serve')
 
+    if metadata:
+        metadata = json.load(open(metadata))
+
     click.echo('Serve! files={} on port {}'.format(files, port))
-    app = Datasette(files, cache_headers=not debug and not reload).app()
+    app = Datasette(
+        files,
+        cache_headers=not debug and not reload,
+        metadata=metadata,
+    ).app()
     app.run(host=host, port=port, debug=debug)

datasette/utils.py

@@ -122,10 +122,10 @@ def make_dockerfile(files):
 FROM python:3
 COPY . /app
 WORKDIR /app
-RUN pip install https://static.simonwillison.net/static/2017/datasette-0.1-py3-none-any.whl
-RUN datasette build {}
+RUN pip install https://static.simonwillison.net/static/2017/datasette-0.2-py3-none-any.whl
+RUN datasette build_metadata {} --metadata metadata.json
 EXPOSE 8006
-CMD ["datasette", "serve", {}, "--port", "8006"]'''.format(
+CMD ["datasette", "serve", {}, "--port", "8006", "--metadata", "metadata.json"]'''.format(
     ' '.join(files),
     '"' + '", "'.join(files) + '"',
 ).strip()
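
For a single hypothetical mydb.db, that template renders to:

    FROM python:3
    COPY . /app
    WORKDIR /app
    RUN pip install https://static.simonwillison.net/static/2017/datasette-0.2-py3-none-any.whl
    RUN datasette build_metadata mydb.db --metadata metadata.json
    EXPOSE 8006
    CMD ["datasette", "serve", "mydb.db", "--port", "8006", "--metadata", "metadata.json"]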

setup.py

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(
     name='datasette',
-    version='0.1',
+    version='0.2',
     packages=find_packages(),
     package_data={'datasette': ['templates/*.html']},
     include_package_data=True,