download original file formats from Dataverse #1242

Dataverse creates plain-text, preservation-friendly copies of certain
file formats (some of which are proprietary, such as Stata or SPSS). By
default it is this .tab (tab-separated) copy that gets downloaded, unless
you supply `format=original`; this pull request adds that parameter to
the download URL.

The original filename (e.g. foo.dta, a Stata file) comes from
`originalFileName`, which is only populated when the preservation copy
(e.g. foo.tab) has been successfully created.

Additional variables were created to distinguish between `filename`,
`original_filename`, and `filename_with_path`. If `original_filename`
is available, it's the right one to use.

To allow the tests to continue passing, the test's mock now strips the
query parameters from the request URL so that just the file id can be
cast to an int.
pull/1253/head
Philip Durbin 2023-03-10 16:46:05 -05:00
rodzic 21fa80fad6
commit 938d91c9ca
2 zmienionych plików z 13 dodań i 4 usunięć

Wyświetl plik

@ -102,11 +102,18 @@ class Dataverse(DoiProvider):
for fobj in deep_get(record, "latestVersion.files"):
file_url = (
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
# without format=original you get the preservation format (plain text, tab separated)
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
)
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
filename = fobj["label"]
original_filename = fobj["dataFile"].get("originalFileName", None)
if original_filename:
# replace preservation format filename (foo.tab) with original filename (foo.dta)
filename = original_filename
file_ref = {"download": file_url, "filename": filename}
filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
file_ref = {"download": file_url, "filename": filename_with_path}
fetch_map = {key: key for key in file_ref.keys()}
yield from self.fetch_file(file_ref, fetch_map, output_dir)

Wyświetl plik

@ -5,6 +5,7 @@ from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.request import Request, urlopen
from urllib.parse import urlsplit
import pytest
@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
def mock_filecontent(req, context):
file_no = int(req.url.split("/")[-1]) - 1
parts = urlsplit(req.url)
file_no = int(parts.path.split("/")[-1]) - 1
return open(dv_files[file_no], "rb").read()
requests_mock.get(