download original file formats from Dataverse #1242

Dataverse creates plain-text, preservation-friendly copies of certain file formats (some of which are proprietary, such as Stata or SPSS). By default, the download API returns this preservation copy, a .tab (tab-separated) file, unless you supply `format=original`; this pull request adds that parameter so the original file is downloaded instead. The original filename (e.g. foo.dta, a Stata file) comes from `originalFileName`, which is only populated when the preservation copy (e.g. foo.tab) has been successfully created. Additional variables were introduced to distinguish between `filename`, `original_filename`, and `filename_with_path`. When `original_filename` is available, it is the right one to use, so it replaces the preservation filename. Because the download URL now carries a query string, the test mock parses the URL and uses only its path, so the file id can still be cast to an int and the tests continue to pass.
2023-03-10 16:46:05 -05:00 · 2023-03-10 16:46:05 -05:00 · 938d91c9ca
commit 938d91c9ca
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@ -102,11 +102,18 @@ class Dataverse(DoiProvider):

        for fobj in deep_get(record, "latestVersion.files"):
            file_url = (
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
+                # without format=original you get the preservation format (plain text, tab separated)
+                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
            )
-            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])
+            filename = fobj["label"]
+            original_filename = fobj["dataFile"].get("originalFileName", None)
+            if original_filename:
+                # replace preservation format filename (foo.tab) with original filename (foo.dta)
+                filename = original_filename

-            file_ref = {"download": file_url, "filename": filename}
+            filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)
+
+            file_ref = {"download": file_url, "filename": filename_with_path}
            fetch_map = {key: key for key in file_ref.keys()}

            yield from self.fetch_file(file_ref, fetch_map, output_dir)
--- a/tests/unit/contentproviders/test_dataverse.py
+++ b/tests/unit/contentproviders/test_dataverse.py
@ -5,6 +5,7 @@ from io import BytesIO
 from tempfile import TemporaryDirectory
 from unittest.mock import patch
 from urllib.request import Request, urlopen
+from urllib.parse import urlsplit

 import pytest

@ -131,7 +132,8 @@ def test_dataverse_fetch(dv_files, requests_mock):
    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    def mock_filecontent(req, context):
-        file_no = int(req.url.split("/")[-1]) - 1
+        parts = urlsplit(req.url)
+        file_no = int(parts.path.split("/")[-1]) - 1
        return open(dv_files[file_no], "rb").read()

    requests_mock.get(