Merge pull request #1390 from yuvipanda/use-api

Use REST APIs to resolve DOIs + cleanup dataverse provider
Min RK 2024-12-20 08:41:25 +01:00 committed by GitHub
commit b7c151536d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 312 additions and 269 deletions

View file

@@ -60,6 +60,7 @@ jobs:
           - r
           - unit
           - venv
+          - contentproviders
         include:
           # The actions/setup-python action with Python version 3.6 isn't
           # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04

View file

@@ -1,9 +1,11 @@
+import hashlib
 import json
 import os
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from typing import List, Tuple
+from urllib.parse import parse_qs, urlparse
 
-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider
@@ -23,10 +25,11 @@ class Dataverse(DoiProvider):
             self.hosts = json.load(fp)["installations"]
 
         super().__init__()
 
-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if given spec is hosted on dataverse
 
-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ class Dataverse(DoiProvider):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
 
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
@@ -53,57 +58,137 @@ class Dataverse(DoiProvider):
         if host is None:
             return
 
-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
-            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid infinite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # Raw url pointing to a datafile is a typical output from an External Tool integration
-            entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier, query the search api to get the parent dataset
-            search_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
-            )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
-            if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
-                )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
-
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return url
+
+    def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) of the dataset that a given file_id (int or doi) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a doi itself
+            api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {base_url}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
+    def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
+        """
+        Parse the persistent id out of a dataverse URL
+
+        persistent_id can point to either a dataset or a file. The second return
+        value is False if we know whether the persistent id is a file or a dataset,
+        and True if it is ambiguous.
+
+        Raises a ValueError if we can not parse the url
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        is_ambiguous = False
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # What we have here is an entity id, which we can use to get a persistentId
+            file_id = os.path.basename(path)
+            persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
+        elif parsed_url.path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            persistent_id = self.get_dataset_id_from_file_id(
+                base_url, file_persistent_id
+            )
+        else:
+            raise ValueError(
+                f"Could not determine persistent id for dataverse URL {url}"
+            )
+
+        return persistent_id, is_ambiguous
+
+    def get_datafiles(self, url: str) -> List[dict]:
+        """
+        Return a list of dataFiles for the given dataverse URL
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        If a URL can not be parsed, throw an exception
+        """
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        persistent_id, is_ambiguous = self.parse_dataverse_url(url)
+
+        dataset_api_url = (
+            f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+        )
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
+
+            dataset_api_url = (
+                f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+            )
+            resp = self._request(
+                dataset_api_url, headers={"accept": "application/json"}
+            )
+
+            if resp.status_code == 404:
+                # This persistent id is just not here
+                raise ValueError(f"{persistent_id} on {base_url} is not found")
+
+        # We already handled 404, raise error for everything else
+        resp.raise_for_status()
+
+        # We now know the exact persistent_id of the dataset we fetched;
+        # save it for use as content_id
+        self.persistent_id = persistent_id
+
+        data = resp.json()["data"]
+
+        return data["latestVersion"]["files"]
 
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
-        host = spec["host"]
+        url = spec
+        parsed_url = urlparse(url)
+        # FIXME: Support determining API URL better
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        yield f"Fetching Dataverse record {url}.\n"
 
-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        record = resp.json()["data"]
-
-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
+                f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
             )
             filename = fobj["label"]
             original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -128,5 +213,9 @@ class Dataverse(DoiProvider):
     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return self.record_id
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
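
For reference, the dataset-to-files flow that the new get_datafiles() relies on can be exercised directly against the Dataverse native API. A minimal standalone sketch (the helper name list_dataverse_files is mine, not part of the provider; endpoints and response fields are the ones used in the diff above):

import requests

BASE = "https://dataverse.harvard.edu"

def list_dataverse_files(base_url: str, persistent_id: str) -> list:
    # Same dataset metadata endpoint that get_datafiles() calls
    resp = requests.get(
        f"{base_url}/api/datasets/:persistentId",
        params={"persistentId": persistent_id},
        headers={"accept": "application/json"},
    )
    resp.raise_for_status()
    return resp.json()["data"]["latestVersion"]["files"]

for fobj in list_dataverse_files(BASE, "doi:10.7910/DVN/TJCLKP"):
    # format=original asks for the uploaded file instead of the
    # archival/preservation format (plain text, tab separated)
    file_id = fobj["dataFile"]["id"]
    print(fobj["label"], f"{BASE}/api/access/datafile/{file_id}?format=original")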

View file

@@ -46,21 +46,28 @@ class DoiProvider(ContentProvider):
         # Transform a DOI to a URL
         # If not a doi, assume we have a URL and return
         if is_doi(doi):
-            doi = normalize_doi(doi)
-
-            try:
-                resp = self._request(f"https://doi.org/{doi}")
-                resp.raise_for_status()
-            except HTTPError as e:
-                # If the DOI doesn't exist, just return URL
-                if e.response.status_code == 404:
-                    return doi
-                # Reraise any other errors because if the DOI service is down (or
-                # we hit a rate limit) we don't want to silently continue to the
-                # default Git provider as this leads to a misleading error.
-                self.log.error(f"DOI {doi} does not resolve: {e}")
-                raise
-            return resp.url
+            normalized_doi = normalize_doi(doi)
+
+            # Use the doi.org resolver API,
+            # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
+            req_url = f"https://doi.org/api/handles/{normalized_doi}"
+            resp = self._request(req_url)
+            if resp.status_code == 404:
+                # Not a doi, return what we were passed in
+                return doi
+            elif resp.status_code == 200:
+                data = resp.json()
+                # Pick the first URL we find from the doi response
+                for v in data["values"]:
+                    if v["type"] == "URL":
+                        return v["data"]["value"]
+
+                # No URLs found for this doi
+                self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")
+                return doi
+            else:
+                # Any other status code is an error
+                resp.raise_for_status()
         else:
             # Just return what is actually just a URL
             return doi
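
The response shape that the loop above walks can be seen with a few lines of standalone code; a rough sketch (the response excerpt is abridged and illustrative, following the proxy REST API documented at the link in the diff):

import requests

resp = requests.get("https://doi.org/api/handles/10.5281/zenodo.3242074")
resp.raise_for_status()
# A successful lookup returns a list of handle values; doi2url() picks the
# first one whose type is "URL", roughly:
#   {"responseCode": 1,
#    "handle": "10.5281/zenodo.3242074",
#    "values": [{"index": 1, "type": "URL",
#                "data": {"format": "string",
#                         "value": "https://zenodo.org/record/3242074"}}]}
url = next(v["data"]["value"] for v in resp.json()["values"] if v["type"] == "URL")
print(url)

Note that the registered URL is returned as-is (here the older /record/ form), rather than whatever a browser redirect chain would end at; this is why the test_doi.py expectation below changes from /records/ to /record/.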

View file

@@ -0,0 +1,143 @@
import hashlib
import os
from tempfile import TemporaryDirectory

import pytest

from repo2docker.contentproviders import Dataverse


@pytest.mark.parametrize(
    ("doi", "resolved"),
    [
        (
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
        ),
        (
            "10.7910/DVN/6ZXAGT/3YRRYJ",
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
        ),
        (
            "10.7910/DVN/TJCLKP",
            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
        ),
        (
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
        ),
        (
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
        ),
        ("/some/random/string", None),
        ("https://example.com/path/here", None),
        # Non dataverse DOIs
        ("https://doi.org/10.21105/joss.01277", None),
    ],
)
def test_detect(doi, resolved):
    assert Dataverse().detect(doi) == resolved


@pytest.mark.parametrize(
    ("url", "persistent_id", "is_ambiguous"),
    [
        (
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "doi:10.7910/DVN/6ZXAGT",
            False,
        ),
        (
            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            "doi:10.7910/DVN/TJCLKP",
            True,
        ),
        (
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "doi:10.7910/DVN/3MJ7IR",
            False,
        ),
        (
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            "hdl:11529/10016",
            False,
        ),
    ],
)
def test_get_persistent_id(url, persistent_id, is_ambiguous):
    assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous)


@pytest.mark.parametrize(
    ("specs", "md5tree"),
    [
        (
            (
                "doi:10.7910/DVN/TJCLKP",
                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            ),
            {
                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
            },
        ),
        (
            (
                "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
                "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            ),
            {
                "ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333",
                "ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a",
                "ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d",
                "ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f",
                "ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588",
                "ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77",
                "ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d",
                "ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905",
                "ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": "d0250e7918993bab1e707358fe5633e0",
                "ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8",
                "ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344",
                "ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f",
                "ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd",
                "ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593",
                "ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de",
                "ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b",
                "ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d",
                "ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70",
                "ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445",
                "ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e",
                "ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f",
                "ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4",
                "ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72",
                "ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1",
                "ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3",
                "ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb",
                "ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c",
                "ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15",
                "ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0",
                "ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d",
            },
        ),
    ],
)
def test_fetch(specs: list[str], md5tree):
    dv = Dataverse()

    for spec in specs:
        with TemporaryDirectory() as d:
            output = []
            for l in dv.fetch(dv.detect(spec), d):
                output.append(l)

            # Verify md5 sum of the files we expect to find
            # We are using md5 instead of something more secure because that is what
            # dataverse itself uses
            for subpath, expected_sha in md5tree.items():
                with open(os.path.join(d, subpath), "rb") as f:
                    h = hashlib.md5()
                    h.update(f.read())
                    assert h.hexdigest() == expected_sha
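
Outside the test suite, the new round trip is simply detect-then-fetch; a minimal usage sketch (the output directory and print handling are illustrative, not part of the PR):

from repo2docker.contentproviders import Dataverse

dv = Dataverse()
spec = dv.detect("doi:10.7910/DVN/TJCLKP")  # a dataverse URL, or None if not dataverse
if spec is not None:
    for msg in dv.fetch(spec, "/tmp/dataverse-output"):
        print(msg, end="")
    print(dv.content_id)  # persistent id, only valid after a successful fetch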

View file

@@ -1,160 +0,0 @@
import json
import os
import re
from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.parse import urlsplit
from urllib.request import Request, urlopen

import pytest

from repo2docker.contentproviders import Dataverse

test_dv = Dataverse()
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
test_hosts = [
    (
        [
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "10.7910/DVN/6ZXAGT",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
        ],
        [
            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
        ],
    )
]
doi_responses = {
    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
        "https://dataverse.harvard.edu/file.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
    ),
    "https://doi.org/10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
        "https://dataverse.harvard.edu/api/access/datafile/3323458"
    ),
    "https://doi.org/10.21105/joss.01277": (
        "https://joss.theoj.org/papers/10.21105/joss.01277"
    ),
}


@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected, requests_mock):
    def doi_resolver(req, context):
        resp = doi_responses.get(req.url)
        # doi responses are redirects
        if resp is not None:
            context.status_code = 302
            context.headers["Location"] = resp
        return resp

    requests_mock.get(re.compile("https://"), json=doi_resolver)
    requests_mock.get(
        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
        json={
            "data": {
                "count_in_response": 1,
                "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
            }
        },
    )

    assert requests_mock.call_count == 0

    # valid Dataverse DOIs trigger this content provider
    assert Dataverse().detect(test_input[0]) == expected[0]
    # 4: doi resolution (302), file, doi resolution (302), then dataset
    assert requests_mock.call_count == 4
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[1]) == expected[0]
    # 2: doi (302), dataset
    assert requests_mock.call_count == 2
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[2]) == expected[0]
    # 1: datafile (search dataverse for the file id)
    assert requests_mock.call_count == 1
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[3]) == expected[1]
    requests_mock.reset_mock()

    # Don't trigger the Dataverse content provider
    assert Dataverse().detect("/some/path/here") is None
    assert Dataverse().detect("https://example.com/path/here") is None
    # don't handle DOIs that aren't from Dataverse
    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None


@pytest.fixture
def dv_files(tmpdir):
    f1 = tmpdir.join("some-file.txt")
    f1.write("some content")

    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
    f2.write("some other content")

    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
    f3.write("yet another content")

    return [f1, f2, f3]


def test_dataverse_fetch(dv_files, requests_mock):
    mock_response = {
        "data": {
            "latestVersion": {
                "files": [
                    {"dataFile": {"id": 1}, "label": "some-file.txt"},
                    {
                        "dataFile": {"id": 2},
                        "label": "some-other-file.txt",
                        "directoryLabel": "directory",
                    },
                    {
                        "dataFile": {"id": 3},
                        "label": "the-other-file.txt",
                        "directoryLabel": "directory/subdirectory",
                    },
                ]
            }
        }
    }

    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    def mock_filecontent(req, context):
        parts = urlsplit(req.url)
        file_no = int(parts.path.split("/")[-1]) - 1
        return open(dv_files[file_no], "rb").read()

    requests_mock.get(
        "https://dataverse.harvard.edu/api/datasets/"
        ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
        json=mock_response,
    )
    requests_mock.get(
        re.compile("https://dataverse.harvard.edu/api/access/datafile"),
        content=mock_filecontent,
    )

    dv = Dataverse()

    with TemporaryDirectory() as d:
        output = []
        for l in dv.fetch(spec, d):
            output.append(l)

        unpacked_files = set(os.listdir(d))
        expected = {"directory", "some-file.txt"}
        assert expected == unpacked_files
        assert os.path.isfile(
            os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
        )

View file

@@ -33,7 +33,7 @@ def test_url_headers(requests_mock):
 @pytest.mark.parametrize(
     "requested_doi, expected",
     [
-        ("10.5281/zenodo.3242074", "https://zenodo.org/records/3242074"),
+        ("10.5281/zenodo.3242074", "https://zenodo.org/record/3242074"),
         # Unresolving DOI:
         ("10.1/1234", "10.1/1234"),
     ],

View file

@@ -21,16 +21,9 @@ test_content_ids = [
 @pytest.mark.parametrize("link,expected", test_content_ids)
-def test_content_id(link, expected, requests_mock):
-    def mocked_get(req, context):
-        if req.url.startswith("https://doi.org"):
-            context.status_code = 302
-            context.headers["Location"] = link
-        return link
-
-    requests_mock.get(re.compile("https://"), text=mocked_get)
-
+def test_content_id(link, expected):
     fig = Figshare()
-    fig.detect("10.6084/m9.figshare.9782777")
+    fig.detect(link)
     assert fig.content_id == expected

View file

@@ -34,20 +34,14 @@ hydroshare_data = {
 }
 
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_content_id():
     hydro = Hydroshare()
 
     hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
+    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"
 
-def test_detect_hydroshare(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_detect_hydroshare():
     # valid Hydroshare DOIs trigger this content provider
     expected = {
         "host": {
@@ -59,7 +53,7 @@ def test_detect_hydroshare(requests_mock):
             "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
         },
         "resource": "b8f6eae9d89241cf8b5904033460af61",
-        "version": "1569427757",
+        "version": "1585005408",
     }
 
     assert (
@@ -68,16 +62,10 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert a call to urlopen was called to fetch version
-    assert requests_mock.call_count == 1
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
     )
-    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect(
@@ -85,9 +73,6 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     # Don't trigger the Hydroshare content provider
     assert Hydroshare().detect("/some/path/here") is None
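
The bumped version value above comes from the resource metadata endpoint named in the expected host dict (the "version" URL template). A quick way to inspect the live value, sketched under the assumption that the scimeta payload carries a "dates" list as the provider expects:

import requests

resource_id = "b8f6eae9d89241cf8b5904033460af61"
url = f"https://www.hydroshare.org/hsapi/resource/{resource_id}/scimeta/elements"
meta = requests.get(url).json()
# The provider derives the ".vNNNN" suffix from the resource's last-modified
# date, so the pinned 1585005408 must be updated whenever the resource changes
modified = [d for d in meta.get("dates", []) if d.get("type") == "modified"]
print(modified)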

View file

@@ -21,18 +21,7 @@ doi_responses = {
 }
 
-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_content_id():
     zen = Zenodo()
     zen.detect("10.5281/zenodo.3232985")
     assert zen.content_id == "3232985"
@@ -60,15 +49,11 @@ test_hosts = [
 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected, requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_detect_zenodo(test_input, expected):
     # valid Zenodo DOIs trigger this content provider
     assert Zenodo().detect(test_input[0]) == expected
     assert Zenodo().detect(test_input[1]) == expected
     assert Zenodo().detect(test_input[2]) == expected
-    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
-    assert requests_mock.call_count == 4
-    requests_mock.reset_mock()
 
     # Don't trigger the Zenodo content provider
     assert Zenodo().detect("/some/path/here") is None
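
With the 302-redirect mocks removed, these simplified tests resolve DOIs against the live doi.org service. If offline runs were ever wanted again, a hedged sketch of stubbing the new handle endpoint with requests_mock (test name and response values are hypothetical, mirroring the payload shape doi2url() parses):

import re

from repo2docker.contentproviders import Zenodo

def test_detect_offline(requests_mock):
    # Answer any handle lookup with a Zenodo record URL, mimicking the
    # doi.org proxy response shape that doi2url() walks for type "URL"
    requests_mock.get(
        re.compile(r"https://doi\.org/api/handles/"),
        json={
            "responseCode": 1,
            "values": [
                {"type": "URL", "data": {"value": "https://zenodo.org/record/3232985"}}
            ],
        },
    )
    assert Zenodo().detect("10.5281/zenodo.3232985") is not None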