diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 1998f6be..9054f53c 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -102,11 +102,18 @@ class Dataverse(DoiProvider): for fobj in deep_get(record, "latestVersion.files"): file_url = ( - f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}' + # without format=original you get the preservation format (plain text, tab separated) + f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' ) - filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"]) + filename = fobj["label"] + original_filename = fobj["dataFile"].get("originalFileName", None) + if original_filename: + # replace preservation format filename (foo.tab) with original filename (foo.dta) + filename = original_filename - file_ref = {"download": file_url, "filename": filename} + filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename) + + file_ref = {"download": file_url, "filename": filename_with_path} fetch_map = {key: key for key in file_ref.keys()} yield from self.fetch_file(file_ref, fetch_map, output_dir) diff --git a/tests/unit/contentproviders/test_dataverse.py b/tests/unit/contentproviders/test_dataverse.py index 1603b9cd..c5c0d1cb 100644 --- a/tests/unit/contentproviders/test_dataverse.py +++ b/tests/unit/contentproviders/test_dataverse.py @@ -5,6 +5,7 @@ from io import BytesIO from tempfile import TemporaryDirectory from unittest.mock import patch from urllib.request import Request, urlopen +from urllib.parse import urlsplit import pytest @@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock): spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"} def mock_filecontent(req, context): - file_no = int(req.url.split("/")[-1]) - 1 + parts = urlsplit(req.url) + file_no = int(parts.path.split("/")[-1]) - 1 return open(dv_files[file_no], "rb").read() requests_mock.get(