Download individual files instead of zip bundle

pull/739/head
Kacper Kowalik (Xarthisius) 2019-09-17 15:44:56 -05:00
parent a2f8228b15
commit 4df4fd6ab4
No known key found for this signature in database
GPG Key ID: 5D21B852895192F9
2 changed files with 55 additions and 52 deletions
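For context: the change below drops the single zip-bundle request (`/api/access/datafiles/{id,id,...}`), which Dataverse caps at 100MB per the TODO removed in the first file, in favour of one `/api/access/datafile/{id}` request per file. A minimal standalone sketch of that per-file access pattern, outside repo2docker; the host URL, DOI, and output path are illustrative placeholders, and error handling is omitted:

    import json
    import os
    from urllib.request import urlopen

    # Illustrative placeholders, not part of this commit.
    host = "https://dataverse.harvard.edu"
    record = "doi:10.7910/DVN/6ZXAGT"
    output_dir = "output"

    # Resolve the dataset record to its list of file objects.
    url = "{}/api/datasets/:persistentId?persistentId={}".format(host, record)
    with urlopen(url) as resp:
        data = json.loads(resp.read().decode("utf-8"))["data"]

    # Fetch each file individually instead of requesting one zip bundle.
    for fobj in data["latestVersion"]["files"]:
        file_id = fobj["dataFile"]["id"]
        filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
        dst = os.path.join(output_dir, filename)
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        with urlopen("{}/api/access/datafile/{}".format(host, file_id)) as src:
            with open(dst, "wb") as fp:
                fp.write(src.read())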

View File

@@ -4,7 +4,6 @@ import shutil
 from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
-from zipfile import ZipFile
 
 from .doi import DoiProvider
 from ..utils import copytree, deep_get
@@ -104,34 +103,22 @@ class Dataverse(DoiProvider):
         resp = self.urlopen(req)
         record = json.loads(resp.read().decode("utf-8"))["data"]
 
-        # In order to fetch entire dataset we build a list of file IDs we want to fetch
-        # and then receive a zip file containing all of them.
-        # TODO: Dataverse has a limit for the zipfile size (see
-        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729)
-        # If size of the dataset is grater than 100MB individual files should be downloaded.
-        file_ids = [
-            str(deep_get(fobj, "dataFile.id"))
-            for fobj in deep_get(record, "latestVersion.files")
-        ]
+        for fobj in deep_get(record, "latestVersion.files"):
+            file_url = "{}/api/access/datafile/{}".format(
+                host["url"], deep_get(fobj, "dataFile.id")
+            )
+            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
 
-        req = Request(
-            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
-        )
+            file_ref = {"download": file_url, "filename": filename}
+            fetch_map = {key: key for key in file_ref.keys()}
 
-        dst_fname = os.path.join(output_dir, "dataverse.zip")
-        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
-            yield "Fetching files bundle\n"
-            shutil.copyfileobj(src, dst)
+            for line in self.fetch_file(file_ref, fetch_map, output_dir):
+                yield line
 
-        yield "Extracting files\n"
-        with ZipFile(dst_fname) as zfile:
-            zfile.extractall(path=output_dir)
-
-        os.remove(dst_fname)
         new_subdirs = os.listdir(output_dir)
         # if there is only one new subdirectory move its contents
         # to the top level directory
-        if len(new_subdirs) == 1:
+        if len(new_subdirs) == 1 and os.path.isdir(new_subdirs[0]):
             d = new_subdirs[0]
             copytree(os.path.join(output_dir, d), output_dir)
             shutil.rmtree(os.path.join(output_dir, d))
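The final flattening step survives the rewrite: when everything lands in a single top-level directory, its contents are promoted into `output_dir`. A stdlib-only sketch of that behaviour, with a hypothetical helper name; note the sketch joins `output_dir` before the `isdir` check, since `os.listdir` returns bare names, whereas the line added above tests `new_subdirs[0]` relative to the current working directory:

    import os
    import shutil

    def flatten_single_subdir(output_dir):
        # If output_dir holds exactly one entry and that entry is a
        # directory, move its contents up into output_dir itself.
        entries = os.listdir(output_dir)
        if len(entries) == 1 and os.path.isdir(os.path.join(output_dir, entries[0])):
            subdir = os.path.join(output_dir, entries[0])
            for name in os.listdir(subdir):
                shutil.move(os.path.join(subdir, name), output_dir)
            os.rmdir(subdir)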

View File

@@ -2,12 +2,10 @@ import json
 import os
 
 import pytest
 
-from contextlib import contextmanager
 from io import BytesIO
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+from tempfile import TemporaryDirectory
 from unittest.mock import patch
 from urllib.request import urlopen, Request
-from zipfile import ZipFile
 
 from repo2docker.contentproviders import Dataverse
@@ -82,47 +80,65 @@ def test_detect_dataverse(test_input, expected):
     assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
-@contextmanager
-def dv_archive(prefix="a_directory"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr("{}/some-file.txt".format(prefix), "some content")
-            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
+@pytest.fixture
+def dv_files(tmpdir):
+    f1 = tmpdir.join("some-file.txt")
+    f1.write("some content")
 
-        yield zfile.name
+    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
+    f2.write("some other content")
+
+    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
+    f3.write("yet another content")
+
+    return [f1, f2, f3]
 
 
-def test_dataverse_fetch():
+def test_dataverse_fetch(dv_files):
     mock_response_ds_query = BytesIO(
         json.dumps(
             {
                 "data": {
                     "latestVersion": {
-                        "files": [{"dataFile": {"id": 1}}, {"dataFile": {"id": 2}}]
+                        "files": [
+                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                            {
+                                "dataFile": {"id": 2},
+                                "label": "some-other-file.txt",
+                                "directoryLabel": "directory",
+                            },
+                            {
+                                "dataFile": {"id": 3},
+                                "label": "the-other-file.txt",
+                                "directoryLabel": "directory/subdirectory",
+                            },
+                        ]
                     }
                 }
             }
         ).encode("utf-8")
     )
     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
 
     dv = Dataverse()
 
-    with dv_archive() as data_local_path:
-
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                if "/api/datasets" in req.full_url:
-                    return mock_response_ds_query
-                elif "/api/access/datafiles" in req.full_url:
-                    assert req.full_url.endswith("1,2")
-                    return urlopen("file://{}".format(data_local_path))
+    def mock_urlopen(self, req):
+        if isinstance(req, Request):
+            return mock_response_ds_query
+        else:
+            file_no = int(req.split("/")[-1]) - 1
+            return urlopen("file://{}".format(dv_files[file_no]))
 
-        with patch.object(Dataverse, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                output = []
-                for l in dv.fetch(spec, d):
-                    output.append(l)
+    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(spec, d):
+                output.append(l)
 
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+            unpacked_files = set(os.listdir(d))
+            expected = set(["directory", "some-file.txt"])
+            assert expected == unpacked_files
+            assert os.path.isfile(
+                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
+            )
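The updated test keeps the same mocking pattern: `unittest.mock.patch.object` with `new=` swaps `Dataverse.urlopen` for a plain function, so dataset queries (issued as `Request` objects) receive the canned JSON while bare string URLs resolve to the fixture files via `file://`. A self-contained example of that pattern; `Client` and `fake_urlopen` are hypothetical names:

    from unittest.mock import patch

    class Client:
        def urlopen(self, req):
            raise RuntimeError("would hit the network")

        def fetch(self):
            return self.urlopen("https://example.com/api")

    def fake_urlopen(self, req):
        # Replaces the method on the class, so `self` is the Client instance.
        return "canned response for {}".format(req)

    with patch.object(Client, "urlopen", new=fake_urlopen):
        assert Client().fetch() == "canned response for https://example.com/api"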