kopia lustrzana https://github.com/jupyterhub/repo2docker
download original file formats from Dataverse #1242
Dataverse creates plain-text, preservation-friendly copies of certain file formats (some of which are proprietary, such as Stata or SPSS). This .tab (tab-separated) copy is what gets downloaded unless you supply `format=original`, which is what this pull request does. The original filename (e.g. foo.dta, a Stata file) comes from `originalFileName`, which is only populated when the preservation copy (e.g. foo.tab) has been successfully created. Additional variables were created to distinguish between `filename`, `original_filename`, and `filename_with_path`. If `original_filename` is available, it's the right one to use. To allow the tests to continue passing, the test mock now strips the query parameters from the request URL so that just the file id can be cast to an int.
pull/1253/head
rodzic
21fa80fad6
commit
938d91c9ca
|
@ -102,11 +102,18 @@ class Dataverse(DoiProvider):
|
|||
|
||||
for fobj in deep_get(record, "latestVersion.files"):
|
||||
file_url = (
|
||||
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
|
||||
# without format=original you get the preservation format (plain text, tab separated)
|
||||
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
|
||||
)
|
||||
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
|
||||
filename = fobj["label"]
|
||||
original_filename = fobj["dataFile"].get("originalFileName", None)
|
||||
if original_filename:
|
||||
# replace preservation format filename (foo.tab) with original filename (foo.dta)
|
||||
filename = original_filename
|
||||
|
||||
file_ref = {"download": file_url, "filename": filename}
|
||||
filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
|
||||
|
||||
file_ref = {"download": file_url, "filename": filename_with_path}
|
||||
fetch_map = {key: key for key in file_ref.keys()}
|
||||
|
||||
yield from self.fetch_file(file_ref, fetch_map, output_dir)
|
||||
|
|
|
@ -5,6 +5,7 @@ from io import BytesIO
|
|||
from tempfile import TemporaryDirectory
|
||||
from unittest.mock import patch
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
|
|||
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
|
||||
|
||||
def mock_filecontent(req, context):
|
||||
file_no = int(req.url.split("/")[-1]) - 1
|
||||
parts = urlsplit(req.url)
|
||||
file_no = int(parts.path.split("/")[-1]) - 1
|
||||
return open(dv_files[file_no], "rb").read()
|
||||
|
||||
requests_mock.get(
|
||||
|
|
Ładowanie…
Reference in New Issue