From 4df4fd6ab41a44b994f2ea8664ede9df89220020 Mon Sep 17 00:00:00 2001
From: "Kacper Kowalik (Xarthisius)"
Date: Tue, 17 Sep 2019 15:44:56 -0500
Subject: [PATCH] Download individual files instead of zip bundle

---
 repo2docker/contentproviders/dataverse.py     | 33 +++------
 tests/unit/contentproviders/test_dataverse.py | 74 +++++++++++--------
 2 files changed, 55 insertions(+), 52 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 79133757..3aa67994 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -4,7 +4,6 @@ import shutil
 
 from urllib.request import Request
 from urllib.parse import urlparse, urlunparse, parse_qs
-from zipfile import ZipFile
 
 from .doi import DoiProvider
 from ..utils import copytree, deep_get
@@ -104,34 +103,22 @@ class Dataverse(DoiProvider):
         resp = self.urlopen(req)
         record = json.loads(resp.read().decode("utf-8"))["data"]
 
-        # In order to fetch entire dataset we build a list of file IDs we want to fetch
-        # and then receive a zip file containing all of them.
-        # TODO: Dataverse has a limit for the zipfile size (see
-        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729)
-        # If size of the dataset is grater than 100MB individual files should be downloaded.
-        file_ids = [
-            str(deep_get(fobj, "dataFile.id"))
-            for fobj in deep_get(record, "latestVersion.files")
-        ]
+        for fobj in deep_get(record, "latestVersion.files"):
+            file_url = "{}/api/access/datafile/{}".format(
+                host["url"], deep_get(fobj, "dataFile.id")
+            )
+            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
 
-        req = Request(
-            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
-        )
+            file_ref = {"download": file_url, "filename": filename}
+            fetch_map = {key: key for key in file_ref.keys()}
 
-        dst_fname = os.path.join(output_dir, "dataverse.zip")
-        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
-            yield "Fetching files bundle\n"
-            shutil.copyfileobj(src, dst)
+            for line in self.fetch_file(file_ref, fetch_map, output_dir):
+                yield line
 
-        yield "Extracting files\n"
-        with ZipFile(dst_fname) as zfile:
-            zfile.extractall(path=output_dir)
-
-        os.remove(dst_fname)
         new_subdirs = os.listdir(output_dir)
         # if there is only one new subdirectory move its contents
         # to the top level directory
-        if len(new_subdirs) == 1:
+        if len(new_subdirs) == 1 and os.path.isdir(new_subdirs[0]):
             d = new_subdirs[0]
             copytree(os.path.join(output_dir, d), output_dir)
             shutil.rmtree(os.path.join(output_dir, d))
diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py
index 69ab9917..76d39654 100644
--- a/tests/unit/contentproviders/test_dataverse.py
+++ b/tests/unit/contentproviders/test_dataverse.py
@@ -2,12 +2,10 @@ import json
 import os
 import pytest
 
-from contextlib import contextmanager
 from io import BytesIO
-from tempfile import TemporaryDirectory, NamedTemporaryFile
+from tempfile import TemporaryDirectory
 from unittest.mock import patch
 from urllib.request import urlopen, Request
-from zipfile import ZipFile
 
 from repo2docker.contentproviders import Dataverse
 
@@ -82,47 +80,65 @@ def test_detect_dataverse(test_input, expected):
     assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
 
 
-@contextmanager
-def dv_archive(prefix="a_directory"):
-    with NamedTemporaryFile(suffix=".zip") as zfile:
-        with ZipFile(zfile.name, mode="w") as zip:
-            zip.writestr("{}/some-file.txt".format(prefix), "some content")
-            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
+@pytest.fixture
+def dv_files(tmpdir):
 
-        yield zfile.name
+    f1 = tmpdir.join("some-file.txt")
+    f1.write("some content")
+
+    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
+    f2.write("some other content")
+
+    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
+    f3.write("yet another content")
+
+    return [f1, f2, f3]
 
 
-def test_dataverse_fetch():
+def test_dataverse_fetch(dv_files):
     mock_response_ds_query = BytesIO(
         json.dumps(
             {
                 "data": {
                     "latestVersion": {
-                        "files": [{"dataFile": {"id": 1}}, {"dataFile": {"id": 2}}]
+                        "files": [
+                            {"dataFile": {"id": 1}, "label": "some-file.txt"},
+                            {
+                                "dataFile": {"id": 2},
+                                "label": "some-other-file.txt",
+                                "directoryLabel": "directory",
+                            },
+                            {
+                                "dataFile": {"id": 3},
+                                "label": "the-other-file.txt",
+                                "directoryLabel": "directory/subdirectory",
+                            },
+                        ]
                     }
                 }
             }
         ).encode("utf-8")
     )
 
     spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
+    dv = Dataverse()
 
-    with dv_archive() as data_local_path:
+    def mock_urlopen(self, req):
+        if isinstance(req, Request):
+            return mock_response_ds_query
+        else:
+            file_no = int(req.split("/")[-1]) - 1
+            return urlopen("file://{}".format(dv_files[file_no]))
 
-        def mock_urlopen(self, req):
-            if isinstance(req, Request):
-                if "/api/datasets" in req.full_url:
-                    return mock_response_ds_query
-                elif "/api/access/datafiles" in req.full_url:
-                    assert req.full_url.endswith("1,2")
-                    return urlopen("file://{}".format(data_local_path))
+    with patch.object(Dataverse, "urlopen", new=mock_urlopen):
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(spec, d):
+                output.append(l)
 
-        with patch.object(Dataverse, "urlopen", new=mock_urlopen):
-            with TemporaryDirectory() as d:
-                output = []
-                for l in dv.fetch(spec, d):
-                    output.append(l)
-
-                unpacked_files = set(os.listdir(d))
-                expected = set(["some-other-file.txt", "some-file.txt"])
-                assert expected == unpacked_files
+            unpacked_files = set(os.listdir(d))
+            expected = set(["directory", "some-file.txt"])
+            assert expected == unpacked_files
+            assert os.path.isfile(
+                os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
+            )
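
Note on the provider change: the loop in dataverse.py hands each file to DoiProvider.fetch_file, but the Dataverse calls behind it are plain REST requests. Below is a minimal standalone sketch of the same per-file flow, assuming a public dataset and using only the standard library; the helper name download_dataset and the example host/DOI are illustrative, not part of repo2docker.

    import json
    import os
    from urllib.request import urlopen

    def download_dataset(host_url, persistent_id, output_dir):
        # Resolve the dataset record first, mirroring the /api/datasets query
        # the provider issues before the loop shown in the patch.
        record_url = "{}/api/datasets/:persistentId?persistentId={}".format(
            host_url, persistent_id
        )
        with urlopen(record_url) as resp:
            record = json.loads(resp.read().decode("utf-8"))["data"]

        # Fetch every datafile individually via /api/access/datafile/{id}
        # instead of one zip bundle, keeping directoryLabel as the folder layout.
        for fobj in record["latestVersion"]["files"]:
            file_url = "{}/api/access/datafile/{}".format(
                host_url, fobj["dataFile"]["id"]
            )
            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
            dst = os.path.join(output_dir, filename)
            os.makedirs(os.path.dirname(dst), exist_ok=True)
            with urlopen(file_url) as src, open(dst, "wb") as out:
                out.write(src.read())

    # Hypothetical call, reusing the DOI from the test:
    # download_dataset("https://dataverse.harvard.edu", "doi:10.7910/DVN/6ZXAGT", "/tmp/dataset")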
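
A second sketch shows how a caller drives the reworked provider end to end. It mirrors the spec used in the test, needs network access, and the single-key host entry below is an assumption; the real host list shipped with repo2docker (for example the harvard_dv entry in the test) carries more fields than "url".

    from tempfile import TemporaryDirectory

    from repo2docker.contentproviders import Dataverse

    spec = {
        "host": {"url": "https://dataverse.harvard.edu"},  # assumed minimal host entry
        "record": "doi:10.7910/DVN/6ZXAGT",
    }

    with TemporaryDirectory() as output_dir:
        # fetch() is a generator of progress messages; after the change above it
        # issues one request per datafile rather than downloading a zip bundle.
        for message in Dataverse().fetch(spec, output_dir):
            print(message, end="")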