Merge pull request #1390 from yuvipanda/use-api

Use REST APIs to resolve DOIs + cleanup dataverse provider
Min RK 2024-12-20 08:41:25 +01:00 committed by GitHub
commit b7c151536d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 312 additions and 269 deletions

View file

@@ -60,6 +60,7 @@ jobs:
           - r
           - unit
           - venv
+          - contentproviders
         include:
           # The actions/setup-python action with Python version 3.6 isn't
           # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04

View file

@@ -1,9 +1,11 @@
+import hashlib
 import json
 import os
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from typing import List, Tuple
+from urllib.parse import parse_qs, urlparse
 
-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider
@@ -23,10 +25,11 @@ class Dataverse(DoiProvider):
             self.hosts = json.load(fp)["installations"]
 
         super().__init__()
 
-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if given spec is hosted on dataverse
 
-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ class Dataverse(DoiProvider):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
 
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
@@ -53,57 +58,137 @@ class Dataverse(DoiProvider):
         if host is None:
             return
 
-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
-            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid infinite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # Raw url pointing to a datafile is a typical output from an External Tool integration
-            entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier, query the search api to get the parent dataset
-            search_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
-            )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
-            if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
-                )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
-
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return url
+
+    def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) of the dataset that a given file_id (int or doi) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a doi itself
+            api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {base_url}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
+    def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
+        """
+        Parse the persistent id out of a dataverse URL
+
+        persistent_id can point to either a dataset or a file. The second return
+        value is False if we know whether the persistent id is a file or a dataset,
+        and True if it is ambiguous.
+
+        Raises a ValueError if we can not parse the url
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        is_ambiguous = False
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # What we have here is an entity id, which we can use to get a persistentId
+            file_id = os.path.basename(path)
+            persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
+        elif parsed_url.path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            persistent_id = self.get_dataset_id_from_file_id(
+                base_url, file_persistent_id
+            )
+        else:
+            raise ValueError(
+                f"Could not determine persistent id for dataverse URL {url}"
+            )
+
+        return persistent_id, is_ambiguous
+
+    def get_datafiles(self, url: str) -> List[dict]:
+        """
+        Return a list of dataFiles for the given dataverse URL
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        If a URL can not be parsed, throw an exception
+        """
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        persistent_id, is_ambiguous = self.parse_dataverse_url(url)
+
+        dataset_api_url = (
+            f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+        )
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
+
+            dataset_api_url = (
+                f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+            )
+            resp = self._request(
+                dataset_api_url, headers={"accept": "application/json"}
+            )
+
+            if resp.status_code == 404:
+                # This persistent id is just not here
+                raise ValueError(f"{persistent_id} on {base_url} is not found")
+
+        # We already handled 404, raise error for everything else
+        resp.raise_for_status()
+
+        # We now know the exact persistent_id of the dataset we fetched;
+        # save it for use as content_id
+        self.persistent_id = persistent_id
+
+        data = resp.json()["data"]
+
+        return data["latestVersion"]["files"]
 
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
-        host = spec["host"]
+        url = spec
+        parsed_url = urlparse(url)
+        # FIXME: Support determining API URL better
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
 
-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        yield f"Fetching Dataverse record {url}.\n"
 
-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        record = resp.json()["data"]
-
-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
+                f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
             )
             filename = fobj["label"]
             original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -128,5 +213,9 @@ class Dataverse(DoiProvider):
     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return self.record_id
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
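
For reference, the dataset-to-files flow that the new get_datafiles() relies on can be exercised directly against the Dataverse native API. A minimal standalone sketch (the helper name list_dataverse_files is mine, not part of the provider; endpoints and response fields are the ones used in the diff above):

import requests

BASE = "https://dataverse.harvard.edu"

def list_dataverse_files(base_url: str, persistent_id: str) -> list:
    # Same dataset metadata endpoint that get_datafiles() calls
    resp = requests.get(
        f"{base_url}/api/datasets/:persistentId",
        params={"persistentId": persistent_id},
        headers={"accept": "application/json"},
    )
    resp.raise_for_status()
    return resp.json()["data"]["latestVersion"]["files"]

for fobj in list_dataverse_files(BASE, "doi:10.7910/DVN/TJCLKP"):
    # format=original asks for the uploaded file instead of the
    # archival/preservation format (plain text, tab separated)
    file_id = fobj["dataFile"]["id"]
    print(fobj["label"], f"{BASE}/api/access/datafile/{file_id}?format=original")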

View file

@@ -46,21 +46,28 @@ class DoiProvider(ContentProvider):
         # Transform a DOI to a URL
         # If not a doi, assume we have a URL and return
         if is_doi(doi):
-            doi = normalize_doi(doi)
-
-            try:
-                resp = self._request(f"https://doi.org/{doi}")
-                resp.raise_for_status()
-            except HTTPError as e:
-                # If the DOI doesn't exist, just return URL
-                if e.response.status_code == 404:
-                    return doi
-                # Reraise any other errors because if the DOI service is down (or
-                # we hit a rate limit) we don't want to silently continue to the
-                # default Git provider as this leads to a misleading error.
-                self.log.error(f"DOI {doi} does not resolve: {e}")
-                raise
-            return resp.url
+            normalized_doi = normalize_doi(doi)
+
+            # Use the doi.org resolver API,
+            # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
+            req_url = f"https://doi.org/api/handles/{normalized_doi}"
+            resp = self._request(req_url)
+            if resp.status_code == 404:
+                # Not a doi, return what we were passed in
+                return doi
+            elif resp.status_code == 200:
+                data = resp.json()
+                # Pick the first URL we find from the doi response
+                for v in data["values"]:
+                    if v["type"] == "URL":
+                        return v["data"]["value"]
+
+                # No URLs found for this doi
+                self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")
+                return doi
+            else:
+                # Any other status code is an error
+                resp.raise_for_status()
         else:
             # Just return what is actually just a URL
             return doi
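
The response shape that the loop above walks can be seen with a few lines of standalone code; a rough sketch (the response excerpt is abridged and illustrative, following the proxy REST API documented at the link in the diff):

import requests

resp = requests.get("https://doi.org/api/handles/10.5281/zenodo.3242074")
resp.raise_for_status()
# A successful lookup returns a list of handle values; doi2url() picks the
# first one whose type is "URL", roughly:
#   {"responseCode": 1,
#    "handle": "10.5281/zenodo.3242074",
#    "values": [{"index": 1, "type": "URL",
#                "data": {"format": "string",
#                         "value": "https://zenodo.org/record/3242074"}}]}
url = next(v["data"]["value"] for v in resp.json()["values"] if v["type"] == "URL")
print(url)

Note that the registered URL is returned as-is (here the older /record/ form), rather than whatever a browser redirect chain would end at; this is why the test_doi.py expectation below changes from /records/ to /record/.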

View file

@@ -0,0 +1,143 @@
import hashlib
import os
from tempfile import TemporaryDirectory

import pytest

from repo2docker.contentproviders import Dataverse


@pytest.mark.parametrize(
    ("doi", "resolved"),
    [
        (
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
        ),
        (
            "10.7910/DVN/6ZXAGT/3YRRYJ",
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
        ),
        (
            "10.7910/DVN/TJCLKP",
            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
        ),
        (
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
        ),
        (
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
        ),
        ("/some/random/string", None),
        ("https://example.com/path/here", None),
        # Non dataverse DOIs
        ("https://doi.org/10.21105/joss.01277", None),
    ],
)
def test_detect(doi, resolved):
    assert Dataverse().detect(doi) == resolved


@pytest.mark.parametrize(
    ("url", "persistent_id", "is_ambiguous"),
    [
        (
            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "doi:10.7910/DVN/6ZXAGT",
            False,
        ),
        (
            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            "doi:10.7910/DVN/TJCLKP",
            True,
        ),
        (
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "doi:10.7910/DVN/3MJ7IR",
            False,
        ),
        (
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
            "hdl:11529/10016",
            False,
        ),
    ],
)
def test_get_persistent_id(url, persistent_id, is_ambiguous):
    assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous)


@pytest.mark.parametrize(
    ("specs", "md5tree"),
    [
        (
            (
                "doi:10.7910/DVN/TJCLKP",
                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
            ),
            {
                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
            },
        ),
        (
            (
                "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
                "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            ),
            {
                "ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333",
                "ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a",
                "ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d",
                "ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f",
                "ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588",
                "ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77",
                "ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d",
                "ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905",
                "ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": "d0250e7918993bab1e707358fe5633e0",
                "ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8",
                "ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344",
                "ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f",
                "ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd",
                "ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593",
                "ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de",
                "ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b",
                "ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d",
                "ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70",
                "ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445",
                "ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e",
                "ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f",
                "ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4",
                "ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72",
                "ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1",
                "ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3",
                "ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb",
                "ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c",
                "ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15",
                "ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0",
                "ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d",
            },
        ),
    ],
)
def test_fetch(specs: list[str], md5tree):
    dv = Dataverse()

    for spec in specs:
        with TemporaryDirectory() as d:
            output = []
            for l in dv.fetch(dv.detect(spec), d):
                output.append(l)

            # Verify md5 sum of the files we expect to find
            # We are using md5 instead of something more secure because that is what
            # dataverse itself uses
            for subpath, expected_sha in md5tree.items():
                with open(os.path.join(d, subpath), "rb") as f:
                    h = hashlib.md5()
                    h.update(f.read())
                    assert h.hexdigest() == expected_sha
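
Outside the test suite, the new round trip is simply detect-then-fetch; a minimal usage sketch (the output directory and print handling are illustrative, not part of the PR):

from repo2docker.contentproviders import Dataverse

dv = Dataverse()
spec = dv.detect("doi:10.7910/DVN/TJCLKP")  # a dataverse URL, or None if not dataverse
if spec is not None:
    for msg in dv.fetch(spec, "/tmp/dataverse-output"):
        print(msg, end="")
    print(dv.content_id)  # persistent id, only valid after a successful fetch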

View file

@@ -1,160 +0,0 @@
import json
import os
import re
from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.parse import urlsplit
from urllib.request import Request, urlopen

import pytest

from repo2docker.contentproviders import Dataverse

test_dv = Dataverse()
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
test_hosts = [
    (
        [
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "10.7910/DVN/6ZXAGT",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
        ],
        [
            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
        ],
    )
]
doi_responses = {
    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
        "https://dataverse.harvard.edu/file.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
    ),
    "https://doi.org/10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
        "https://dataverse.harvard.edu/api/access/datafile/3323458"
    ),
    "https://doi.org/10.21105/joss.01277": (
        "https://joss.theoj.org/papers/10.21105/joss.01277"
    ),
}


@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected, requests_mock):
    def doi_resolver(req, context):
        resp = doi_responses.get(req.url)
        # doi responses are redirects
        if resp is not None:
            context.status_code = 302
            context.headers["Location"] = resp
        return resp

    requests_mock.get(re.compile("https://"), json=doi_resolver)
    requests_mock.get(
        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
        json={
            "data": {
                "count_in_response": 1,
                "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
            }
        },
    )

    assert requests_mock.call_count == 0

    # valid Dataverse DOIs trigger this content provider
    assert Dataverse().detect(test_input[0]) == expected[0]
    # 4: doi resolution (302), file, doi resolution (302), then dataset
    assert requests_mock.call_count == 4
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[1]) == expected[0]
    # 2: doi (302), dataset
    assert requests_mock.call_count == 2
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[2]) == expected[0]
    # 1: datafile (search dataverse for the file id)
    assert requests_mock.call_count == 1
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[3]) == expected[1]
    requests_mock.reset_mock()

    # Don't trigger the Dataverse content provider
    assert Dataverse().detect("/some/path/here") is None
    assert Dataverse().detect("https://example.com/path/here") is None
    # don't handle DOIs that aren't from Dataverse
    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None


@pytest.fixture
def dv_files(tmpdir):
    f1 = tmpdir.join("some-file.txt")
    f1.write("some content")

    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
    f2.write("some other content")

    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
    f3.write("yet another content")

    return [f1, f2, f3]


def test_dataverse_fetch(dv_files, requests_mock):
    mock_response = {
        "data": {
            "latestVersion": {
                "files": [
                    {"dataFile": {"id": 1}, "label": "some-file.txt"},
                    {
                        "dataFile": {"id": 2},
                        "label": "some-other-file.txt",
                        "directoryLabel": "directory",
                    },
                    {
                        "dataFile": {"id": 3},
                        "label": "the-other-file.txt",
                        "directoryLabel": "directory/subdirectory",
                    },
                ]
            }
        }
    }

    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    def mock_filecontent(req, context):
        parts = urlsplit(req.url)
        file_no = int(parts.path.split("/")[-1]) - 1
        return open(dv_files[file_no], "rb").read()

    requests_mock.get(
        "https://dataverse.harvard.edu/api/datasets/"
        ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
        json=mock_response,
    )
    requests_mock.get(
        re.compile("https://dataverse.harvard.edu/api/access/datafile"),
        content=mock_filecontent,
    )

    dv = Dataverse()

    with TemporaryDirectory() as d:
        output = []
        for l in dv.fetch(spec, d):
            output.append(l)

        unpacked_files = set(os.listdir(d))
        expected = {"directory", "some-file.txt"}
        assert expected == unpacked_files
        assert os.path.isfile(
            os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
        )

View file

@@ -33,7 +33,7 @@ def test_url_headers(requests_mock):
 @pytest.mark.parametrize(
     "requested_doi, expected",
     [
-        ("10.5281/zenodo.3242074", "https://zenodo.org/records/3242074"),
+        ("10.5281/zenodo.3242074", "https://zenodo.org/record/3242074"),
         # Unresolving DOI:
         ("10.1/1234", "10.1/1234"),
     ],

View file

@@ -21,16 +21,9 @@ test_content_ids = [
 @pytest.mark.parametrize("link,expected", test_content_ids)
-def test_content_id(link, expected, requests_mock):
-    def mocked_get(req, context):
-        if req.url.startswith("https://doi.org"):
-            context.status_code = 302
-            context.headers["Location"] = link
-        return link
-
-    requests_mock.get(re.compile("https://"), text=mocked_get)
-
+def test_content_id(link, expected):
     fig = Figshare()
-    fig.detect("10.6084/m9.figshare.9782777")
+    fig.detect(link)
     assert fig.content_id == expected

View file

@@ -34,20 +34,14 @@ hydroshare_data = {
 }
 
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_content_id():
     hydro = Hydroshare()
 
     hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
+    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"
 
-def test_detect_hydroshare(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_detect_hydroshare():
     # valid Hydroshare DOIs trigger this content provider
     expected = {
         "host": {
@@ -59,7 +53,7 @@ def test_detect_hydroshare(requests_mock):
             "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
         },
         "resource": "b8f6eae9d89241cf8b5904033460af61",
-        "version": "1569427757",
+        "version": "1585005408",
     }
 
     assert (
@@ -68,16 +62,10 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert a call to urlopen was called to fetch version
-    assert requests_mock.call_count == 1
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
     )
-    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     assert (
         Hydroshare().detect(
@@ -85,9 +73,6 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()
 
     # Don't trigger the Hydroshare content provider
     assert Hydroshare().detect("/some/path/here") is None
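
The bumped version value above comes from the resource metadata endpoint named in the expected host dict (the "version" URL template). A quick way to inspect the live value, sketched under the assumption that the scimeta payload carries a "dates" list as the provider expects:

import requests

resource_id = "b8f6eae9d89241cf8b5904033460af61"
url = f"https://www.hydroshare.org/hsapi/resource/{resource_id}/scimeta/elements"
meta = requests.get(url).json()
# The provider derives the ".vNNNN" suffix from the resource's last-modified
# date, so the pinned 1585005408 must be updated whenever the resource changes
modified = [d for d in meta.get("dates", []) if d.get("type") == "modified"]
print(modified)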

View file

@@ -21,18 +21,7 @@ doi_responses = {
 }
 
-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_content_id():
     zen = Zenodo()
     zen.detect("10.5281/zenodo.3232985")
     assert zen.content_id == "3232985"
@@ -60,15 +49,11 @@ test_hosts = [
 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected, requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_detect_zenodo(test_input, expected):
     # valid Zenodo DOIs trigger this content provider
     assert Zenodo().detect(test_input[0]) == expected
     assert Zenodo().detect(test_input[1]) == expected
     assert Zenodo().detect(test_input[2]) == expected
-    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
-    assert requests_mock.call_count == 4
-    requests_mock.reset_mock()
 
     # Don't trigger the Zenodo content provider
     assert Zenodo().detect("/some/path/here") is None
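
With the 302-redirect mocks removed, these simplified tests resolve DOIs against the live doi.org service. If offline runs were ever wanted again, a hedged sketch of stubbing the new handle endpoint with requests_mock (test name and response values are hypothetical, mirroring the payload shape doi2url() parses):

import re

from repo2docker.contentproviders import Zenodo

def test_detect_offline(requests_mock):
    # Answer any handle lookup with a Zenodo record URL, mimicking the
    # doi.org proxy response shape that doi2url() walks for type "URL"
    requests_mock.get(
        re.compile(r"https://doi\.org/api/handles/"),
        json={
            "responseCode": 1,
            "values": [
                {"type": "URL", "data": {"value": "https://zenodo.org/record/3232985"}}
            ],
        },
    )
    assert Zenodo().detect("10.5281/zenodo.3232985") is not None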