Download individual files instead of zip bundle

pull/739/head
Kacper Kowalik (Xarthisius) 2019-09-17 15:44:56 -05:00
parent a2f8228b15
commit 4df4fd6ab4
No known key found for this signature in database
GPG key ID: 5D21B852895192F9
2 changed files with 55 additions and 52 deletions

repo2docker/contentproviders/dataverse.py

@@ -4,7 +4,6 @@ import shutil
 
 from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
-from zipfile import ZipFile
 
 from .doi import DoiProvider
 from ..utils import copytree, deep_get
@@ -104,34 +103,22 @@ class Dataverse(DoiProvider):
         resp = self.urlopen(req)
         record = json.loads(resp.read().decode("utf-8"))["data"]
 
-        # In order to fetch entire dataset we build a list of file IDs we want to fetch
-        # and then receive a zip file containing all of them.
-        # TODO: Dataverse has a limit for the zipfile size (see
-        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729)
-        # If size of the dataset is grater than 100MB individual files should be downloaded.
-        file_ids = [
-            str(deep_get(fobj, "dataFile.id"))
-            for fobj in deep_get(record, "latestVersion.files")
-        ]
-        req = Request(
-            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
-        )
-
-        dst_fname = os.path.join(output_dir, "dataverse.zip")
-        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
-            yield "Fetching files bundle\n"
-            shutil.copyfileobj(src, dst)
-
-        yield "Extracting files\n"
-        with ZipFile(dst_fname) as zfile:
-            zfile.extractall(path=output_dir)
-
-        os.remove(dst_fname)
+        for fobj in deep_get(record, "latestVersion.files"):
+            file_url = "{}/api/access/datafile/{}".format(
+                host["url"], deep_get(fobj, "dataFile.id")
+            )
+            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
+
+            file_ref = {"download": file_url, "filename": filename}
+            fetch_map = {key: key for key in file_ref.keys()}
+
+            for line in self.fetch_file(file_ref, fetch_map, output_dir):
+                yield line
 
         new_subdirs = os.listdir(output_dir)
         # if there is only one new subdirectory move its contents
         # to the top level directory
-        if len(new_subdirs) == 1:
+        if len(new_subdirs) == 1 and os.path.isdir(new_subdirs[0]):
             d = new_subdirs[0]
             copytree(os.path.join(output_dir, d), output_dir)
             shutil.rmtree(os.path.join(output_dir, d))
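
In effect, the change swaps the bundled /api/access/datafiles/{id},{id},... zip endpoint for one /api/access/datafile/{id} request per file, so dataset size is no longer capped by the server-side zip limit, and directoryLabel is used to rebuild the dataset's folder layout. A minimal standalone sketch of the same flow against the Dataverse native API (the host and persistent ID below are placeholders for illustration, not taken from this commit):

import json
import os
from urllib.request import urlopen

# Placeholder values; any reachable Dataverse instance and
# published dataset DOI would work the same way.
host = "https://demo.dataverse.org"
persistent_id = "doi:10.5072/FK2/EXAMPLE"
output_dir = "dataset"

# Resolve the dataset record through the native API.
url = "{}/api/datasets/:persistentId/?persistentId={}".format(host, persistent_id)
record = json.loads(urlopen(url).read().decode("utf-8"))["data"]

# Fetch every file individually instead of one zip bundle.
for fobj in record["latestVersion"]["files"]:
    file_url = "{}/api/access/datafile/{}".format(host, fobj["dataFile"]["id"])
    # directoryLabel carries the folder hierarchy within the dataset.
    filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
    dst = os.path.join(output_dir, filename)
    os.makedirs(os.path.dirname(dst), exist_ok=True)
    with urlopen(file_url) as src, open(dst, "wb") as f:
        f.write(src.read())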

tests/unit/contentproviders/test_dataverse.py

@@ -2,12 +2,10 @@ import json
 import os
 
 import pytest
 
-from contextlib import contextmanager
 from io import BytesIO
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+from tempfile import TemporaryDirectory
 from unittest.mock import patch
 from urllib.request import urlopen, Request
-from zipfile import ZipFile
 
 from repo2docker.contentproviders import Dataverse
@@ -82,40 +80,55 @@ def test_detect_dataverse(test_input, expected):
     assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
-@contextmanager
-def dv_archive(prefix="a_directory"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr("{}/some-file.txt".format(prefix), "some content")
-            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
-
-        yield zfile.name
+@pytest.fixture
+def dv_files(tmpdir):
+    f1 = tmpdir.join("some-file.txt")
+    f1.write("some content")
+
+    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
+    f2.write("some other content")
+
+    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
+    f3.write("yet another content")
+
+    return [f1, f2, f3]
 
 
-def test_dataverse_fetch():
+def test_dataverse_fetch(dv_files):
     mock_response_ds_query = BytesIO(
         json.dumps(
             {
                 "data": {
                     "latestVersion": {
-                        "files": [{"dataFile": {"id": 1}}, {"dataFile": {"id": 2}}]
+                        "files": [
+                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                            {
+                                "dataFile": {"id": 2},
+                                "label": "some-other-file.txt",
+                                "directoryLabel": "directory",
+                            },
+                            {
+                                "dataFile": {"id": 3},
+                                "label": "the-other-file.txt",
+                                "directoryLabel": "directory/subdirectory",
+                            },
+                        ]
                     }
                 }
             }
         ).encode("utf-8")
     )
 
     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
-    dv = Dataverse()
 
-    with dv_archive() as data_local_path:
+    dv = Dataverse()
 
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                if "/api/datasets" in req.full_url:
-                    return mock_response_ds_query
-                elif "/api/access/datafiles" in req.full_url:
-                    assert req.full_url.endswith("1,2")
-                    return urlopen("file://{}".format(data_local_path))
+    def mock_urlopen(self, req):
+        if isinstance(req, Request):
+            return mock_response_ds_query
+        else:
+            file_no = int(req.split("/")[-1]) - 1
+            return urlopen("file://{}".format(dv_files[file_no]))
 
     with patch.object(Dataverse, "urlopen", new=mock_urlopen):
         with TemporaryDirectory() as d:
@@ -124,5 +137,8 @@ def test_dataverse_fetch():
                 output.append(l)
 
             unpacked_files = set(os.listdir(d))
-            expected = set(["some-other-file.txt", "some-file.txt"])
+            expected = set(["directory", "some-file.txt"])
             assert expected == unpacked_files
+            assert os.path.isfile(
+                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
+            )
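
The test's trick of serving fixture files through file:// URLs is a reusable pattern: patch.object swaps the provider's urlopen for a function that routes any "download" to a local path. A self-contained sketch of that pattern, with illustrative names not taken from repo2docker:

import os
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.request import urlopen


class Downloader:
    def urlopen(self, req):
        # The real implementation would hit the network.
        return urlopen(req)

    def fetch(self, url):
        return self.urlopen(url).read()


with TemporaryDirectory() as d:
    path = os.path.join(d, "some-file.txt")
    with open(path, "w") as f:
        f.write("some content")

    # Route every download to the local fixture via a file:// URL.
    def mock_urlopen(self, req):
        return urlopen("file://{}".format(path))

    with patch.object(Downloader, "urlopen", new=mock_urlopen):
        assert Downloader().fetch("https://example.com/x") == b"some content"

Because patch.object replaces the attribute on the class, the replacement is called like a normal bound method and receives self first, which is why mock_urlopen in the test above takes (self, req).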