download original file formats from Dataverse #1242

Dataverse creates plain-text, preservation-friendly copies of certain
file formats (some of which are proprietary, such as Stata or SPSS).
This .tab (tab-separated) copy is what gets downloaded unless you supply
`format=original` in the download URL, which this pull request now does.

The original filename (e.g. foo.dta, a Stata file) comes from
`originalFileName`, which is only populated when the preservation copy
(e.g. foo.tab) has been successfully created.

Additional variables were created to distinguish between `filename`,
`original_filename`, and `filename_with_path`. If `original_filename`
is available, it's the right one to use.

To keep the tests passing, the mocked file-content handler now strips
the query parameters from the request URL, so that just the file id
(the last path segment) can be cast to an int.
pull/1253/head
Philip Durbin 2023-03-10 16:46:05 -05:00
rodzic 21fa80fad6
commit 938d91c9ca
2 zmienionych plików z 13 dodań i 4 usunięć

Wyświetl plik

@ -102,11 +102,18 @@ class Dataverse(DoiProvider):
for fobj in deep_get(record, "latestVersion.files"): for fobj in deep_get(record, "latestVersion.files"):
file_url = ( file_url = (
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}' # without format=original you get the preservation format (plain text, tab separated)
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
) )
filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"]) filename = fobj["label"]
original_filename = fobj["dataFile"].get("originalFileName", None)
if original_filename:
# replace preservation format filename (foo.tab) with original filename (foo.dta)
filename = original_filename
file_ref = {"download": file_url, "filename": filename} filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
file_ref = {"download": file_url, "filename": filename_with_path}
fetch_map = {key: key for key in file_ref.keys()} fetch_map = {key: key for key in file_ref.keys()}
yield from self.fetch_file(file_ref, fetch_map, output_dir) yield from self.fetch_file(file_ref, fetch_map, output_dir)

Wyświetl plik

@ -5,6 +5,7 @@ from io import BytesIO
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from unittest.mock import patch from unittest.mock import patch
from urllib.request import Request, urlopen from urllib.request import Request, urlopen
from urllib.parse import urlsplit
import pytest import pytest
@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"} spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
def mock_filecontent(req, context): def mock_filecontent(req, context):
file_no = int(req.url.split("/")[-1]) - 1 parts = urlsplit(req.url)
file_no = int(parts.path.split("/")[-1]) - 1
return open(dv_files[file_no], "rb").read() return open(dv_files[file_no], "rb").read()
requests_mock.get( requests_mock.get(