Mirror of https://github.com/simonw/datasette
Streaming mode for downloading all rows as a CSV (#315)
* table.csv?_stream=1 to download all rows - refs #266

  This option causes Datasette to serve ALL rows in the table, by internally
  following the _next= pagination links and serving everything out as a stream.
  Also added a new config option, allow_csv_stream, which can be used to
  disable this feature.

* New config option max_csv_mb, limiting the size of the CSV export.
parent 0d7ba1ba67
commit fc3660cfad
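The approach described in the commit message - internally following the _next= pagination links and writing each page of rows out as it arrives - can be illustrated with a small standalone sketch. This is not Datasette's internal code: the fetch_page() helper, the page dict keys and the generator shape are assumptions made purely for illustration.

import csv
import io


def stream_table_as_csv(fetch_page, table):
    # Yield CSV text chunks: the header row first, then every row from every
    # page, following the "next" pagination token until it runs out.
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    next_token = None
    first = True
    while first or next_token:
        page = fetch_page(table, next_token)  # hypothetical page fetcher
        if first:
            writer.writerow(page["columns"])
            first = False
        for row in page["rows"]:
            writer.writerow(row)
        next_token = page.get("next")
        yield buffer.getvalue()
        buffer.seek(0)
        buffer.truncate(0)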
@@ -13,6 +13,7 @@ script:
 jobs:
   include:
   - stage: deploy latest.datasette.io
+    if: branch = master AND type = push
     script:
     - pip install .
     - npm install -g now
@@ -23,7 +24,6 @@ jobs:
     - now alias --token=$NOW_TOKEN
     - echo "{\"name\":\"datasette-latest-$ALIAS\",\"alias\":\"$ALIAS.datasette.io\"}" > now.json
     - now alias --token=$NOW_TOKEN
-      on: master
   - stage: release tagged version
     if: tag IS present
     python: 3.6
@@ -94,6 +94,12 @@ CONFIG_OPTIONS = (
     ConfigOption("cache_size_kb", 0, """
         SQLite cache size in KB (0 == use SQLite default)
     """.strip()),
+    ConfigOption("allow_csv_stream", True, """
+        Allow .csv?_stream=1 to download all rows (ignoring max_returned_rows)
+    """.strip()),
+    ConfigOption("max_csv_mb", 100, """
+        Maximum size allowed for CSV export in MB. Set 0 to disable this limit.
+    """.strip()),
 )
 DEFAULT_CONFIG = {
     option.name: option.default
@@ -832,3 +832,22 @@ def value_as_boolean(value):
 class ValueAsBooleanError(ValueError):
     pass
 
+
+class WriteLimitExceeded(Exception):
+    pass
+
+
+class LimitedWriter:
+    def __init__(self, writer, limit_mb):
+        self.writer = writer
+        self.limit_bytes = limit_mb * 1024 * 1024
+        self.bytes_count = 0
+
+    def write(self, bytes):
+        self.bytes_count += len(bytes)
+        if self.limit_bytes and (self.bytes_count > self.limit_bytes):
+            raise WriteLimitExceeded("CSV contains more than {} bytes".format(
+                self.limit_bytes
+            ))
+        self.writer.write(bytes)
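For illustration only (not part of the diff above), the LimitedWriter added here can be exercised against an in-memory buffer; the io.BytesIO stand-in for the streaming response object and the 1 MB limit are assumptions made for this example.

import io

from datasette.utils import LimitedWriter, WriteLimitExceeded

buffer = io.BytesIO()  # stand-in for the streaming HTTP response object
writer = LimitedWriter(buffer, limit_mb=1)

try:
    for _ in range(2000):
        writer.write(b"x" * 1024)  # 2000 KB in total, past the 1 MB cap
except WriteLimitExceeded as error:
    print(error)  # CSV contains more than 1048576 bytes

The size check runs before the underlying write, so once the running byte count passes the limit nothing further is written and the caller gets a clear exception to surface to the client.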
@@ -16,6 +16,7 @@ from datasette.utils import (
     CustomJSONEncoder,
     InterruptedError,
     InvalidSql,
+    LimitedWriter,
     path_from_row_pks,
     path_with_added_args,
     path_with_format,
@@ -150,13 +151,23 @@ class BaseView(RenderMixin):
         return await self.view_get(request, name, hash, **kwargs)
 
     async def as_csv(self, request, name, hash, **kwargs):
+        stream = request.args.get("_stream")
+        if stream:
+            # Some quick sanity checks
+            if not self.ds.config["allow_csv_stream"]:
+                raise DatasetteError("CSV streaming is disabled", status=400)
+            if request.args.get("_next"):
+                raise DatasetteError(
+                    "_next not allowed for CSV streaming", status=400
+                )
+            kwargs["_size"] = "max"
+        # Fetch the first page
         try:
             response_or_template_contexts = await self.data(
                 request, name, hash, **kwargs
             )
             if isinstance(response_or_template_contexts, response.HTTPResponse):
                 return response_or_template_contexts
 
             else:
                 data, extra_template_data, templates = response_or_template_contexts
         except (sqlite3.OperationalError, InvalidSql) as e:
@@ -167,6 +178,7 @@ class BaseView(RenderMixin):
 
         except DatasetteError:
             raise
+
         # Convert rows and columns to CSV
         headings = data["columns"]
         # if there are expanded_columns we need to add additional headings
@@ -179,8 +191,22 @@ class BaseView(RenderMixin):
             headings.append("{}_label".format(column))
 
         async def stream_fn(r):
-            writer = csv.writer(r)
+            nonlocal data
+            writer = csv.writer(LimitedWriter(r, self.ds.config["max_csv_mb"]))
+            first = True
+            next = None
+            while first or (next and stream):
+                try:
+                    if next:
+                        kwargs["_next"] = next
+                    if not first:
+                        data, extra_template_data, templates = await self.data(
+                            request, name, hash, **kwargs
+                        )
+                    if first:
                         writer.writerow(headings)
+                        first = False
+                    next = data.get("next")
                     for row in data["rows"]:
                         if not expanded_columns:
                             # Simple path
@@ -195,6 +221,10 @@ class BaseView(RenderMixin):
                                 else:
                                     new_row.append(cell)
                             writer.writerow(new_row)
+                except Exception as e:
+                    print('caught this', e)
+                    r.write(str(e))
+                    return
 
         content_type = "text/plain; charset=utf-8"
         headers = {}
@@ -393,7 +423,8 @@ class BaseView(RenderMixin):
         return r
 
     async def custom_sql(
-        self, request, name, hash, sql, editable=True, canned_query=None
+        self, request, name, hash, sql, editable=True, canned_query=None,
+        _size=None
     ):
         params = request.raw_args
         if "sql" in params:
@@ -415,6 +446,8 @@ class BaseView(RenderMixin):
         extra_args = {}
         if params.get("_timelimit"):
             extra_args["custom_time_limit"] = int(params["_timelimit"])
+        if _size:
+            extra_args["page_size"] = _size
         results = await self.ds.execute(
             name, sql, params, truncate=True, **extra_args
         )
@@ -9,13 +9,13 @@ from .base import BaseView, DatasetteError
 
 class DatabaseView(BaseView):
 
-    async def data(self, request, name, hash, default_labels=False):
+    async def data(self, request, name, hash, default_labels=False, _size=None):
         if request.args.get("sql"):
             if not self.ds.config["allow_sql"]:
                 raise DatasetteError("sql= is not allowed", status=400)
             sql = request.raw_args.pop("sql")
             validate_sql_select(sql)
-            return await self.custom_sql(request, name, hash, sql)
+            return await self.custom_sql(request, name, hash, sql, _size=_size)
 
         info = self.ds.inspect()[name]
         metadata = self.ds.metadata.get("databases", {}).get(name, {})
@@ -220,7 +220,7 @@ class RowTableShared(BaseView):
 
 class TableView(RowTableShared):
 
-    async def data(self, request, name, hash, table, default_labels=False):
+    async def data(self, request, name, hash, table, default_labels=False, _next=None, _size=None):
         canned_query = self.ds.get_canned_query(name, table)
         if canned_query is not None:
             return await self.custom_sql(
@@ -375,7 +375,7 @@ class TableView(RowTableShared):
 
         count_sql = "select count(*) {}".format(from_sql)
 
-        _next = special_args.get("_next")
+        _next = _next or special_args.get("_next")
         offset = ""
         if _next:
             if is_view:
@@ -462,7 +462,7 @@ class TableView(RowTableShared):
 
         extra_args = {}
         # Handle ?_size=500
-        page_size = request.raw_args.get("_size")
+        page_size = _size or request.raw_args.get("_size")
         if page_size:
             if page_size == "max":
                 page_size = self.max_returned_rows
@@ -512,6 +512,8 @@ class TableView(RowTableShared):
         facet_results = {}
         facets_timed_out = []
         for column in facets:
+            if _next:
+                continue
             facet_sql = """
                 select {col} as value, count(*) as count
                 {from_sql} {and_or_where} {col} is not null
@@ -665,6 +667,8 @@ class TableView(RowTableShared):
         for facet_column in columns:
             if facet_column in facets:
                 continue
+            if _next:
+                continue
             if not self.ds.config["suggest_facets"]:
                 continue
             suggested_facet_sql = '''
@@ -125,3 +125,24 @@ Sets the amount of memory SQLite uses for its `per-connection cache <https://www.
 ::
 
     datasette mydatabase.db --config cache_size_kb:5000
+
+
+allow_csv_stream
+----------------
+
+Enables the feature where an entire table (potentially hundreds of thousands of
+rows) can be exported as a single CSV file. This is turned on by default - you
+can turn it off like this::
+
+::
+
+    datasette mydatabase.db --config allow_csv_stream:off
+
+
+max_csv_mb
+----------
+
+The maximum size of CSV that can be exported, in megabytes. Defaults to 100MB.
+You can disable the limit entirely by settings this to 0::
+
+    datasette mydatabase.db --config max_csv_mb:0
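As a usage sketch (not part of this commit), a client could consume the streaming endpoint documented above with the requests library; the host, database and table names here are placeholders.

import requests

# Stream the full table as CSV, letting the server follow its own pagination.
url = "http://localhost:8001/mydatabase/mytable.csv"
params = {"_stream": "1", "_size": "max"}

with requests.get(url, params=params, stream=True) as response:
    response.raise_for_status()
    with open("mytable.csv", "wb") as outfile:
        for chunk in response.iter_content(chunk_size=8192):
            outfile.write(chunk)

If the server-side max_csv_mb limit is hit part-way through, the response is still a 200 but the final line of the file contains the error message, as the test added below demonstrates.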
@@ -122,6 +122,7 @@ This will restrict sorting of ``example_table`` to just the ``height`` and
 You can also disable sorting entirely by setting ``"sortable_columns": []``
 
 .. _label_columns:
 
 Specifying the label column for a table
 ---------------------------------------
+
@@ -71,6 +71,13 @@ def app_client_larger_cache_size():
     })
 
 
+@pytest.fixture(scope='session')
+def app_client_csv_max_mb_one():
+    yield from app_client(config={
+        'max_csv_mb': 1,
+    })
+
+
 def generate_compound_rows(num):
     for a, b, c in itertools.islice(
         itertools.product(string.ascii_lowercase, repeat=3), num
@@ -901,6 +901,8 @@ def test_config_json(app_client):
         "default_cache_ttl": 365 * 24 * 60 * 60,
         "num_sql_threads": 3,
         "cache_size_kb": 0,
+        "allow_csv_stream": True,
+        "max_csv_mb": 100,
     } == response.json
 
 
@@ -1,4 +1,4 @@
-from .fixtures import app_client  # noqa
+from .fixtures import app_client, app_client_csv_max_mb_one  # noqa
 
 EXPECTED_TABLE_CSV = '''id,content
 1,hello
@@ -59,3 +59,28 @@ def test_table_csv_download(app_client):
     assert 'text/csv; charset=utf-8' == response.headers['Content-Type']
     expected_disposition = 'attachment; filename="simple_primary_key.csv"'
     assert expected_disposition == response.headers['Content-Disposition']
+
+
+def test_max_csv_mb(app_client_csv_max_mb_one):
+    response = app_client_csv_max_mb_one.get(
+        "/fixtures.csv?sql=select+randomblob(10000)+"
+        "from+compound_three_primary_keys&_stream=1&_size=max"
+    )
+    # It's a 200 because we started streaming before we knew the error
+    assert response.status == 200
+    # Last line should be an error message
+    last_line = [line for line in response.body.split(b"\r\n") if line][-1]
+    assert last_line.startswith(b"CSV contains more than")
+
+
+def test_table_csv_stream(app_client):
+    # Without _stream should return header + 100 rows:
+    response = app_client.get(
+        "/fixtures/compound_three_primary_keys.csv?_size=max"
+    )
+    assert 101 == len([b for b in response.body.split(b"\r\n") if b])
+    # With _stream=1 should return header + 1001 rows
+    response = app_client.get(
+        "/fixtures/compound_three_primary_keys.csv?_stream=1"
+    )
+    assert 1002 == len([b for b in response.body.split(b"\r\n") if b])