kopia lustrzana https://github.com/jupyterhub/repo2docker
download original file formats from Dataverse #1242
Dataverse creates plain-text, preservation-friendly copies of certain file formats (some of which are proprietary, such as Stata or SPSS). This .tab (tab-separated) copy is what gets downloaded unless you supply `format=original`, which is what this pull request does. The original filename (e.g. foo.dta, a Stata file) comes from `originalFileName`, which is only populated when the preservation copy (e.g. foo.tab) has been successfully created. Additional variables were created to distinguish between `filename`, `original_filename`, and `filename_with_path`. If `original_filename` is available, it's the right one to use. To allow the tests to continue passing, the test's mock now strips the query parameters from the request URL so that just the file id can be cast as an int.
pull/1253/head
rodzic
21fa80fad6
commit
938d91c9ca
|
@ -102,11 +102,18 @@ class Dataverse(DoiProvider):
|
||||||
|
|
||||||
for fobj in deep_get(record, "latestVersion.files"):
|
for fobj in deep_get(record, "latestVersion.files"):
|
||||||
file_url = (
|
file_url = (
|
||||||
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
|
# without format=original you get the preservation format (plain text, tab separated)
|
||||||
|
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
|
||||||
)
|
)
|
||||||
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
|
filename = fobj["label"]
|
||||||
|
original_filename = fobj["dataFile"].get("originalFileName", None)
|
||||||
|
if original_filename:
|
||||||
|
# replace preservation format filename (foo.tab) with original filename (foo.dta)
|
||||||
|
filename = original_filename
|
||||||
|
|
||||||
file_ref = {"download": file_url, "filename": filename}
|
filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
|
||||||
|
|
||||||
|
file_ref = {"download": file_url, "filename": filename_with_path}
|
||||||
fetch_map = {key: key for key in file_ref.keys()}
|
fetch_map = {key: key for key in file_ref.keys()}
|
||||||
|
|
||||||
yield from self.fetch_file(file_ref, fetch_map, output_dir)
|
yield from self.fetch_file(file_ref, fetch_map, output_dir)
|
||||||
|
|
|
@ -5,6 +5,7 @@ from io import BytesIO
|
||||||
from tempfile import TemporaryDirectory
|
from tempfile import TemporaryDirectory
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
from urllib.parse import urlsplit
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
|
||||||
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
|
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
|
||||||
|
|
||||||
def mock_filecontent(req, context):
|
def mock_filecontent(req, context):
|
||||||
file_no = int(req.url.split("/")[-1]) - 1
|
parts = urlsplit(req.url)
|
||||||
|
file_no = int(parts.path.split("/")[-1]) - 1
|
||||||
return open(dv_files[file_no], "rb").read()
|
return open(dv_files[file_no], "rb").read()
|
||||||
|
|
||||||
requests_mock.get(
|
requests_mock.get(
|
||||||
|
|
Ładowanie…
Reference in New Issue