Merge pull request #1390 from yuvipanda/use-api

Use REST APIs to resolve DOIs + cleanup dataverse provider
pull/1393/merge
Min RK 2024-12-20 08:41:25 +01:00 committed by GitHub
commit b7c151536d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 312 additions and 269 deletions

View file

@@ -60,6 +60,7 @@ jobs:
- r
- unit
- venv
- contentproviders
include:
# The actions/setup-python action with Python version 3.6 isn't
# possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04

View file

@@ -1,9 +1,11 @@
import hashlib
import json
import os
import shutil
from urllib.parse import parse_qs, urlparse, urlunparse
from typing import List, Tuple
from urllib.parse import parse_qs, urlparse
from ..utils import copytree, deep_get
from ..utils import copytree, deep_get, is_doi
from .doi import DoiProvider
@@ -23,10 +25,11 @@ class Dataverse(DoiProvider):
self.hosts = json.load(fp)["installations"]
super().__init__()
def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Dataverse dataset.
def detect(self, spec, ref=None, extra_args=None):
"""
Detect if the given spec is hosted on Dataverse
Handles:
The spec can be:
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
- URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ class Dataverse(DoiProvider):
- https://dataverse.harvard.edu/api/access/datafile/3323458
- doi:10.7910/DVN/6ZXAGT
- doi:10.7910/DVN/6ZXAGT/3YRRYJ
"""
url = self.doi2url(doi)
if is_doi(spec):
url = self.doi2url(spec)
else:
url = spec
# Parse the url, to get the base for later API calls
parsed_url = urlparse(url)
@@ -53,57 +58,137 @@ class Dataverse(DoiProvider):
if host is None:
return
query_args = parse_qs(parsed_url.query)
# Corner case handling
if parsed_url.path.startswith("/file.xhtml"):
# There's no way of getting file information using its persistentId; the only thing we can do is assume the doi
# is structured as "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
new_doi = doi.rsplit("/", 1)[0]
if new_doi == doi:
# tough luck :( Avoid infinite recursion and exit.
return
return self.detect(new_doi)
elif parsed_url.path.startswith("/api/access/datafile"):
# Raw url pointing to a datafile is a typical output from an External Tool integration
entity_id = os.path.basename(parsed_url.path)
search_query = "q=entityId:" + entity_id + "&type=file"
# Knowing the file identifier query search api to get parent dataset
search_url = urlunparse(
parsed_url._replace(path="/api/search", query=search_query)
# At this point, we *know* this is a dataverse URL, because:
# 1. The DOI resolved to a particular host (if using DOI)
# 2. The host is in the list of known dataverse installations
#
# We don't know exactly what kind of dataverse object this is, but
# that can be figured out during fetch as needed
return url
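
For illustration, a minimal sketch of the new detect() contract (spec in, resolved URL or None out), using values that appear in the tests below; it assumes network access to doi.org and the bundled installation list:

from repo2docker.contentproviders import Dataverse

dv = Dataverse()
# A dataset DOI is resolved via doi.org to a known installation and returned as a URL
assert dv.detect("10.7910/DVN/TJCLKP") == (
    "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"
)
# Specs that are neither DOIs nor known Dataverse hosts fall through to None
assert dv.detect("https://example.com/path/here") is None
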
def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
"""
Return the persistent_id (DOI) of the dataset that a given file_id (integer or DOI) belongs to
"""
if file_id.isdigit():
# the file_id is an integer, rather than a persistent id (DOI)
api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
else:
# the file_id is a doi itself
api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
resp = self._request(api_url)
if resp.status_code == 404:
raise ValueError(f"File with id {file_id} not found in {base_url}")
resp.raise_for_status()
data = resp.json()["data"]
return data["datasetVersion"]["datasetPersistentId"]
def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
"""
Parse the persistent id out of a dataverse URL
persistent_id can point to either a dataset or a file. The second return
value is False when we know which of the two the persistent id refers to,
and True when that is ambiguous.
Raises a ValueError if the URL cannot be parsed
"""
parsed_url = urlparse(url)
path = parsed_url.path
qs = parse_qs(parsed_url.query)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
is_ambiguous = False
# https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
if path.startswith("/citation"):
is_ambiguous = True
persistent_id = qs["persistentId"][0]
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
elif path.startswith("/dataset.xhtml"):
persistent_id = qs["persistentId"][0]
# https://dataverse.harvard.edu/api/access/datafile/3323458
elif path.startswith("/api/access/datafile"):
# What we have here is an entity id, which we can use to get a persistentId
file_id = os.path.basename(path)
persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
elif parsed_url.path.startswith("/file.xhtml"):
file_persistent_id = qs["persistentId"][0]
persistent_id = self.get_dataset_id_from_file_id(
base_url, file_persistent_id
)
else:
raise ValueError(
f"Could not determine persistent id for dataverse URL {url}"
)
self.log.debug("Querying Dataverse: " + search_url)
data = self.urlopen(search_url).json()["data"]
if data["count_in_response"] != 1:
self.log.debug(
f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
)
return
self.record_id = deep_get(data, "items.0.dataset_persistent_id")
elif (
parsed_url.path.startswith("/dataset.xhtml")
and "persistentId" in query_args
):
self.record_id = deep_get(query_args, "persistentId.0")
return persistent_id, is_ambiguous
if hasattr(self, "record_id"):
return {"record": self.record_id, "host": host}
def get_datafiles(self, url: str) -> List[dict]:
"""
Return a list of dataFiles for the dataset identified by the given URL
Supports the following *dataset* URL styles:
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
- /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
- /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
- /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
- /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
Raises a ValueError if the URL cannot be parsed
"""
parsed_url = urlparse(url)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
persistent_id, is_ambiguous = self.parse_dataverse_url(url)
dataset_api_url = (
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
)
resp = self._request(dataset_api_url, headers={"accept": "application/json"})
if resp.status_code == 404 and is_ambiguous:
# It's possible this is a *file* persistent_id, not a dataset one
persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
dataset_api_url = (
f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
)
resp = self._request(
dataset_api_url, headers={"accept": "application/json"}
)
if resp.status_code == 404:
# This persistent id is just not here
raise ValueError(f"{persistent_id} on {base_url} is not found")
# We already handled 404, raise error for everything else
resp.raise_for_status()
# We know the exact persistent_id of the dataset we fetched now
# Save it for use as content_id
self.persistent_id = persistent_id
data = resp.json()["data"]
return data["latestVersion"]["files"]
def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Dataverse dataset."""
record_id = spec["record"]
host = spec["host"]
url = spec
parsed_url = urlparse(url)
# FIXME: Support determining API URL better
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
yield f"Fetching Dataverse record {record_id}.\n"
url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
yield f"Fetching Dataverse record {url}.\n"
resp = self.urlopen(url, headers={"accept": "application/json"})
record = resp.json()["data"]
for fobj in deep_get(record, "latestVersion.files"):
for fobj in self.get_datafiles(url):
file_url = (
# without format=original you get the preservation format (plain text, tab separated)
f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
)
filename = fobj["label"]
original_filename = fobj["dataFile"].get("originalFileName", None)
@@ -128,5 +213,9 @@ class Dataverse(DoiProvider):
@property
def content_id(self):
"""The Dataverse persistent identifier."""
return self.record_id
"""
The Dataverse persistent identifier.
Only valid if called after a successful fetch
"""
return self.persistent_id
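
Taken together, the reworked provider flows spec to URL to datafiles: detect() normalizes the spec, fetch() downloads each datafile, and content_id is only meaningful afterwards. A usage sketch (the output directory is arbitrary):

from tempfile import TemporaryDirectory
from repo2docker.contentproviders import Dataverse

dv = Dataverse()
url = dv.detect("doi:10.7910/DVN/TJCLKP")
with TemporaryDirectory() as d:
    for msg in dv.fetch(url, d):
        print(msg, end="")
    print(dv.content_id)  # the dataset's persistent id, recorded during fetch
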

View file

@@ -46,21 +46,28 @@ class DoiProvider(ContentProvider):
# Transform a DOI to a URL
# If not a doi, assume we have a URL and return
if is_doi(doi):
doi = normalize_doi(doi)
normalized_doi = normalize_doi(doi)
try:
resp = self._request(f"https://doi.org/{doi}")
resp.raise_for_status()
except HTTPError as e:
# If the DOI doesn't exist, just return URL
if e.response.status_code == 404:
return doi
# Reraise any other errors because if the DOI service is down (or
# we hit a rate limit) we don't want to silently continue to the
# default Git provider as this leads to a misleading error.
self.log.error(f"DOI {doi} does not resolve: {e}")
# Use the doi.org resolver API
# documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
req_url = f"https://doi.org/api/handles/{normalized_doi}"
resp = self._request(req_url)
if resp.status_code == 404:
# Not a doi, return what we were passed in
return doi
elif resp.status_code == 200:
data = resp.json()
# Pick the first URL we find from the doi response
for v in data["values"]:
if v["type"] == "URL":
return v["data"]["value"]
# No URLs associated with this doi; fall back to returning it as-is
self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")
return doi
else:
# If we get any other status codes, raise error
raise
return resp.url
else:
# Just return what is actually just a URL
return doi
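
The handle-proxy endpoint used above returns a JSON document whose "values" array carries typed records; doi2url picks the first record of type "URL". A standalone sketch of the same exchange with requests (the DOI is one from the tests):

import requests

resp = requests.get("https://doi.org/api/handles/10.7910/DVN/TJCLKP")
resp.raise_for_status()
for v in resp.json()["values"]:
    if v["type"] == "URL":
        print(v["data"]["value"])  # the registered landing page URL
        break
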

View file

@@ -0,0 +1,143 @@
import hashlib
import os
from tempfile import TemporaryDirectory
import pytest
from repo2docker.contentproviders import Dataverse
@pytest.mark.parametrize(
("doi", "resolved"),
[
(
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
),
(
"10.7910/DVN/6ZXAGT/3YRRYJ",
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
),
(
"10.7910/DVN/TJCLKP",
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
),
(
"https://dataverse.harvard.edu/api/access/datafile/3323458",
"https://dataverse.harvard.edu/api/access/datafile/3323458",
),
(
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
),
("/some/random/string", None),
("https://example.com/path/here", None),
# Non dataverse DOIs
("https://doi.org/10.21105/joss.01277", None),
],
)
def test_detect(doi, resolved):
assert Dataverse().detect(doi) == resolved
@pytest.mark.parametrize(
("url", "persistent_id", "is_ambiguous"),
[
(
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"doi:10.7910/DVN/6ZXAGT",
False,
),
(
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
"doi:10.7910/DVN/TJCLKP",
True,
),
(
"https://dataverse.harvard.edu/api/access/datafile/3323458",
"doi:10.7910/DVN/3MJ7IR",
False,
),
(
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
"hdl:11529/10016",
False,
),
],
)
def test_get_persistent_id(url, persistent_id, is_ambiguous):
assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous)
@pytest.mark.parametrize(
("specs", "md5tree"),
[
(
(
"doi:10.7910/DVN/TJCLKP",
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
),
{
"data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
"data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
"code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
},
),
(
(
"https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
),
{
"ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333",
"ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a",
"ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d",
"ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f",
"ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588",
"ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77",
"ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d",
"ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905",
"ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": "d0250e7918993bab1e707358fe5633e0",
"ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8",
"ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344",
"ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f",
"ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd",
"ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593",
"ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de",
"ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b",
"ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d",
"ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70",
"ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445",
"ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e",
"ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f",
"ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4",
"ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72",
"ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1",
"ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3",
"ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb",
"ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c",
"ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15",
"ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0",
"ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d",
},
),
],
)
def test_fetch(specs: list[str], md5tree):
dv = Dataverse()
for spec in specs:
with TemporaryDirectory() as d:
output = []
for l in dv.fetch(dv.detect(spec), d):
output.append(l)
# Verify md5 sum of the files we expect to find
# We are using md5 instead of something more secure because that is what
# dataverse itself uses
for subpath, expected_md5 in md5tree.items():
with open(os.path.join(d, subpath), "rb") as f:
h = hashlib.md5()
h.update(f.read())
assert h.hexdigest() == expected_md5

View file

@@ -1,160 +0,0 @@
import json
import os
import re
from io import BytesIO
from tempfile import TemporaryDirectory
from unittest.mock import patch
from urllib.parse import urlsplit
from urllib.request import Request, urlopen
import pytest
from repo2docker.contentproviders import Dataverse
test_dv = Dataverse()
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
test_hosts = [
(
[
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
"10.7910/DVN/6ZXAGT",
"https://dataverse.harvard.edu/api/access/datafile/3323458",
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
],
[
{"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
{"host": cimmyt_dv, "record": "hdl:11529/10016"},
],
)
]
doi_responses = {
"https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
"https://dataverse.harvard.edu/file.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
),
"https://doi.org/10.7910/DVN/6ZXAGT": (
"https://dataverse.harvard.edu/dataset.xhtml"
"?persistentId=doi:10.7910/DVN/6ZXAGT"
),
"https://dataverse.harvard.edu/api/access/datafile/3323458": (
"https://dataverse.harvard.edu/api/access/datafile/3323458"
),
"https://doi.org/10.21105/joss.01277": (
"https://joss.theoj.org/papers/10.21105/joss.01277"
),
}
@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected, requests_mock):
def doi_resolver(req, context):
resp = doi_responses.get(req.url)
# doi responses are redirects
if resp is not None:
context.status_code = 302
context.headers["Location"] = resp
return resp
requests_mock.get(re.compile("https://"), json=doi_resolver)
requests_mock.get(
"https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
json={
"data": {
"count_in_response": 1,
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
}
},
)
assert requests_mock.call_count == 0
# valid Dataverse DOIs trigger this content provider
assert Dataverse().detect(test_input[0]) == expected[0]
# 4: doi resolution (302), File, doi resolution (302), then dataset
assert requests_mock.call_count == 4
requests_mock.reset_mock()
assert Dataverse().detect(test_input[1]) == expected[0]
# 2: doi (302), dataset
assert requests_mock.call_count == 2
requests_mock.reset_mock()
assert Dataverse().detect(test_input[2]) == expected[0]
# 1: datafile (search dataverse for the file id)
assert requests_mock.call_count == 1
requests_mock.reset_mock()
assert Dataverse().detect(test_input[3]) == expected[1]
requests_mock.reset_mock()
# Don't trigger the Dataverse content provider
assert Dataverse().detect("/some/path/here") is None
assert Dataverse().detect("https://example.com/path/here") is None
# don't handle DOIs that aren't from Dataverse
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
@pytest.fixture
def dv_files(tmpdir):
f1 = tmpdir.join("some-file.txt")
f1.write("some content")
f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
f2.write("some other content")
f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
f3.write("yet another content")
return [f1, f2, f3]
def test_dataverse_fetch(dv_files, requests_mock):
mock_response = {
"data": {
"latestVersion": {
"files": [
{"dataFile": {"id": 1}, "label": "some-file.txt"},
{
"dataFile": {"id": 2},
"label": "some-other-file.txt",
"directoryLabel": "directory",
},
{
"dataFile": {"id": 3},
"label": "the-other-file.txt",
"directoryLabel": "directory/subdirectory",
},
]
}
}
}
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
def mock_filecontent(req, context):
parts = urlsplit(req.url)
file_no = int(parts.path.split("/")[-1]) - 1
return open(dv_files[file_no], "rb").read()
requests_mock.get(
"https://dataverse.harvard.edu/api/datasets/"
":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
json=mock_response,
)
requests_mock.get(
re.compile("https://dataverse.harvard.edu/api/access/datafile"),
content=mock_filecontent,
)
dv = Dataverse()
with TemporaryDirectory() as d:
output = []
for l in dv.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
expected = {"directory", "some-file.txt"}
assert expected == unpacked_files
assert os.path.isfile(
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
)

View file

@@ -33,7 +33,7 @@ def test_url_headers(requests_mock):
@pytest.mark.parametrize(
"requested_doi, expected",
[
("10.5281/zenodo.3242074", "https://zenodo.org/records/3242074"),
("10.5281/zenodo.3242074", "https://zenodo.org/record/3242074"),
# Unresolving DOI:
("10.1/1234", "10.1/1234"),
],

View file

@@ -21,16 +21,9 @@ test_content_ids = [
@pytest.mark.parametrize("link,expected", test_content_ids)
def test_content_id(link, expected, requests_mock):
def mocked_get(req, context):
if req.url.startswith("https://doi.org"):
context.status_code = 302
context.headers["Location"] = link
return link
requests_mock.get(re.compile("https://"), text=mocked_get)
def test_content_id(link, expected):
fig = Figshare()
fig.detect("10.6084/m9.figshare.9782777")
fig.detect(link)
assert fig.content_id == expected

View file

@@ -34,20 +34,14 @@ hydroshare_data = {
}
def test_content_id(requests_mock):
requests_mock.get(re.compile("https://"), json=hydroshare_data)
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
def test_content_id():
hydro = Hydroshare()
hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"
def test_detect_hydroshare(requests_mock):
requests_mock.get(re.compile("https://"), json=hydroshare_data)
requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
def test_detect_hydroshare():
# valid Hydroshare DOIs trigger this content provider
expected = {
"host": {
@@ -59,7 +53,7 @@ def test_detect_hydroshare(requests_mock):
"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
},
"resource": "b8f6eae9d89241cf8b5904033460af61",
"version": "1569427757",
"version": "1585005408",
}
assert (
@@ -68,16 +62,10 @@ def test_detect_hydroshare(requests_mock):
)
== expected
)
# assert a call to urlopen was called to fetch version
assert requests_mock.call_count == 1
requests_mock.reset_mock()
assert (
Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
)
# assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
assert requests_mock.call_count == 3
requests_mock.reset_mock()
assert (
Hydroshare().detect(
@@ -85,9 +73,6 @@ def test_detect_hydroshare(requests_mock):
)
== expected
)
# assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
assert requests_mock.call_count == 3
requests_mock.reset_mock()
# Don't trigger the Hydroshare content provider
assert Hydroshare().detect("/some/path/here") is None

View file

@@ -21,18 +21,7 @@ doi_responses = {
}
def doi_resolver(req, context):
resp = doi_responses.get(req.url)
# doi responses are redirects
if resp is not None:
context.status_code = 302
context.headers["Location"] = resp
return resp
def test_content_id(requests_mock):
requests_mock.get(re.compile("https://"), json=doi_resolver)
def test_content_id():
zen = Zenodo()
zen.detect("10.5281/zenodo.3232985")
assert zen.content_id == "3232985"
@@ -60,15 +49,11 @@ test_hosts = [
@pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_zenodo(test_input, expected, requests_mock):
requests_mock.get(re.compile("https://"), json=doi_resolver)
def test_detect_zenodo(test_input, expected):
# valid Zenodo DOIs trigger this content provider
assert Zenodo().detect(test_input[0]) == expected
assert Zenodo().detect(test_input[1]) == expected
assert Zenodo().detect(test_input[2]) == expected
# only two of the three calls above have to resolve a DOI (2 req per doi resolution)
assert requests_mock.call_count == 4
requests_mock.reset_mock()
# Don't trigger the Zenodo content provider
assert Zenodo().detect("/some/path/here") is None