Mirror of https://github.com/jupyterhub/repo2docker
Download individual files instead of zip bundle
parent a2f8228b15
commit 4df4fd6ab4
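In outline, the commit stops requesting one zipped bundle from /api/access/datafiles/<id1>,<id2>,... (which Dataverse caps in size) and instead fetches each datafile individually from /api/access/datafile/<id>, recreating the dataset's folder layout from directoryLabel. Below is a minimal standalone sketch of that per-file flow, not repo2docker's actual implementation (the real code goes through DoiProvider.fetch_file, as the diff shows); here record is assumed to be the dataset JSON under "data", shaped like the mocked response in the tests further down.

    import os
    from urllib.request import urlopen


    def download_dataset_files(record, host_url, output_dir):
        """Sketch: download every datafile of a Dataverse dataset one by one."""
        for fobj in record["latestVersion"]["files"]:
            # Per-file access endpoint instead of the zipped datafiles bundle.
            file_url = "{}/api/access/datafile/{}".format(host_url, fobj["dataFile"]["id"])
            # directoryLabel is optional; files without it land at the top level.
            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
            target = os.path.join(output_dir, filename)
            os.makedirs(os.path.dirname(target), exist_ok=True)
            with urlopen(file_url) as src, open(target, "wb") as dst:
                dst.write(src.read())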
dataverse.py (Dataverse content provider):

@@ -4,7 +4,6 @@ import shutil
 from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
-from zipfile import ZipFile
 
 from .doi import DoiProvider
 from ..utils import copytree, deep_get
@@ -104,34 +103,22 @@ class Dataverse(DoiProvider):
         resp = self.urlopen(req)
         record = json.loads(resp.read().decode("utf-8"))["data"]
 
-        # In order to fetch entire dataset we build a list of file IDs we want to fetch
-        # and then receive a zip file containing all of them.
-        # TODO: Dataverse has a limit for the zipfile size (see
-        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729)
-        # If size of the dataset is grater than 100MB individual files should be downloaded.
-        file_ids = [
-            str(deep_get(fobj, "dataFile.id"))
-            for fobj in deep_get(record, "latestVersion.files")
-        ]
-        req = Request(
-            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
-        )
-
-        dst_fname = os.path.join(output_dir, "dataverse.zip")
-        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
-            yield "Fetching files bundle\n"
-            shutil.copyfileobj(src, dst)
-
-        yield "Extracting files\n"
-        with ZipFile(dst_fname) as zfile:
-            zfile.extractall(path=output_dir)
-
-        os.remove(dst_fname)
+        for fobj in deep_get(record, "latestVersion.files"):
+            file_url = "{}/api/access/datafile/{}".format(
+                host["url"], deep_get(fobj, "dataFile.id")
+            )
+            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
+
+            file_ref = {"download": file_url, "filename": filename}
+            fetch_map = {key: key for key in file_ref.keys()}
+
+            for line in self.fetch_file(file_ref, fetch_map, output_dir):
+                yield line
 
         new_subdirs = os.listdir(output_dir)
         # if there is only one new subdirectory move its contents
         # to the top level directory
-        if len(new_subdirs) == 1:
+        if len(new_subdirs) == 1 and os.path.isdir(new_subdirs[0]):
             d = new_subdirs[0]
             copytree(os.path.join(output_dir, d), output_dir)
             shutil.rmtree(os.path.join(output_dir, d))
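The new loop hands each file to self.fetch_file(file_ref, fetch_map, output_dir), which comes from DoiProvider and is not part of this diff. Roughly, fetch_map tells the helper which keys of file_ref carry the download URL and the target filename (here each key maps to itself). The following is a hypothetical stand-in for that helper, only to illustrate the contract the new code relies on, not the real DoiProvider method.

    import os
    from urllib.request import urlopen


    def fetch_file(file_ref, fetch_map, output_dir):
        """Hypothetical stand-in for DoiProvider.fetch_file (not the real implementation)."""
        download_url = file_ref[fetch_map["download"]]
        filename = file_ref[fetch_map["filename"]]
        target = os.path.join(output_dir, filename)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        # Like the provider, report progress as yielded log lines.
        yield "Fetching {}\n".format(filename)
        with urlopen(download_url) as src, open(target, "wb") as dst:
            dst.write(src.read())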
test_dataverse.py (Dataverse content provider tests):

@@ -2,12 +2,10 @@ import json
 import os
 import pytest
 
-from contextlib import contextmanager
 from io import BytesIO
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+from tempfile import TemporaryDirectory
 from unittest.mock import patch
 from urllib.request import urlopen, Request
-from zipfile import ZipFile
 
 from repo2docker.contentproviders import Dataverse
 
@@ -82,40 +80,55 @@ def test_detect_dataverse(test_input, expected):
     assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
-@contextmanager
-def dv_archive(prefix="a_directory"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr("{}/some-file.txt".format(prefix), "some content")
-            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
-
-        yield zfile.name
+@pytest.fixture
+def dv_files(tmpdir):
+
+    f1 = tmpdir.join("some-file.txt")
+    f1.write("some content")
+
+    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
+    f2.write("some other content")
+
+    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
+    f3.write("yet another content")
+
+    return [f1, f2, f3]
 
 
-def test_dataverse_fetch():
+def test_dataverse_fetch(dv_files):
     mock_response_ds_query = BytesIO(
         json.dumps(
             {
                 "data": {
                     "latestVersion": {
-                        "files": [{"dataFile": {"id": 1}}, {"dataFile": {"id": 2}}]
+                        "files": [
+                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                            {
+                                "dataFile": {"id": 2},
+                                "label": "some-other-file.txt",
+                                "directoryLabel": "directory",
+                            },
+                            {
+                                "dataFile": {"id": 3},
+                                "label": "the-other-file.txt",
+                                "directoryLabel": "directory/subdirectory",
+                            },
+                        ]
                     }
                 }
             }
         ).encode("utf-8")
     )
     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
-    dv = Dataverse()
 
-    with dv_archive() as data_local_path:
+    dv = Dataverse()
 
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                if "/api/datasets" in req.full_url:
-                    return mock_response_ds_query
-                elif "/api/access/datafiles" in req.full_url:
-                    assert req.full_url.endswith("1,2")
-                    return urlopen("file://{}".format(data_local_path))
+    def mock_urlopen(self, req):
+        if isinstance(req, Request):
+            return mock_response_ds_query
+        else:
+            file_no = int(req.split("/")[-1]) - 1
+            return urlopen("file://{}".format(dv_files[file_no]))
 
-        with patch.object(Dataverse, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
+    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
+        with TemporaryDirectory() as d:
@@ -124,5 +137,8 @@ def test_dataverse_fetch():
                 output.append(l)
 
             unpacked_files = set(os.listdir(d))
-            expected = set(["some-other-file.txt", "some-file.txt"])
+            expected = set(["directory", "some-file.txt"])
             assert expected == unpacked_files
+            assert os.path.isfile(
+                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
+            )