Mirror of https://github.com/jupyterhub/repo2docker
Merge pull request #1390 from yuvipanda/use-api
Use REST APIs to resolve DOIs + cleanup dataverse provider
commit b7c151536d
.github/workflows/test.yml

@@ -60,6 +60,7 @@ jobs:
           - r
           - unit
           - venv
+          - contentproviders
         include:
           # The actions/setup-python action with Python version 3.6 isn't
           # possible to use with the ubuntu-22.04 runner, so we use ubuntu-20.04
repo2docker/contentproviders/dataverse.py

@@ -1,9 +1,11 @@
+import hashlib
 import json
 import os
 import shutil
-from urllib.parse import parse_qs, urlparse, urlunparse
+from typing import List, Tuple
+from urllib.parse import parse_qs, urlparse

-from ..utils import copytree, deep_get
+from ..utils import copytree, deep_get, is_doi
 from .doi import DoiProvider

@@ -23,10 +25,11 @@ class Dataverse(DoiProvider):
         self.hosts = json.load(fp)["installations"]
         super().__init__()

-    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Dataverse dataset.
+    def detect(self, spec, ref=None, extra_args=None):
+        """
+        Detect if the given spec is hosted on Dataverse

-        Handles:
+        The spec can be:
         - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
         - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
         - URL {siteURL}/api/access/datafile/{fileId}
@@ -35,9 +38,11 @@ class Dataverse(DoiProvider):
         - https://dataverse.harvard.edu/api/access/datafile/3323458
         - doi:10.7910/DVN/6ZXAGT
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ

         """
-        url = self.doi2url(doi)
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
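For reference, here is the new detect() contract in action. This is an illustrative sketch, not part of the diff; the inputs and expected outputs are taken from the new test_dataverse.py added later in this PR, and the calls hit doi.org and the Dataverse installation list at runtime:

    from repo2docker.contentproviders import Dataverse

    dv = Dataverse()

    # A DOI (with or without the doi: prefix) is resolved via doi.org first;
    # this one lands on dataverse.harvard.edu, a known installation, so the
    # resolved URL itself is returned as the spec for fetch().
    dv.detect("10.7910/DVN/TJCLKP")
    # -> "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"

    # Plain URLs on a known installation pass through unchanged.
    dv.detect("https://dataverse.harvard.edu/api/access/datafile/3323458")
    # -> "https://dataverse.harvard.edu/api/access/datafile/3323458"

    # Anything else returns None, so other content providers get a chance.
    dv.detect("https://doi.org/10.21105/joss.01277")  # a JOSS paper, not Dataverse
    # -> None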
@@ -53,57 +58,137 @@ class Dataverse(DoiProvider):
         if host is None:
             return

-        query_args = parse_qs(parsed_url.query)
-        # Corner case handling
-        if parsed_url.path.startswith("/file.xhtml"):
-            # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
-            # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
-            new_doi = doi.rsplit("/", 1)[0]
-            if new_doi == doi:
-                # tough luck :( Avoid inifite recursion and exit.
-                return
-            return self.detect(new_doi)
-        elif parsed_url.path.startswith("/api/access/datafile"):
-            # Raw url pointing to a datafile is a typical output from an External Tool integration
-            entity_id = os.path.basename(parsed_url.path)
-            search_query = "q=entityId:" + entity_id + "&type=file"
-            # Knowing the file identifier query search api to get parent dataset
-            search_url = urlunparse(
-                parsed_url._replace(path="/api/search", query=search_query)
-            )
-            self.log.debug("Querying Dataverse: " + search_url)
-            data = self.urlopen(search_url).json()["data"]
-            if data["count_in_response"] != 1:
-                self.log.debug(
-                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dump(data)}\n"
-                )
-                return
-
-            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
-        elif (
-            parsed_url.path.startswith("/dataset.xhtml")
-            and "persistentId" in query_args
-        ):
-            self.record_id = deep_get(query_args, "persistentId.0")
-
-        if hasattr(self, "record_id"):
-            return {"record": self.record_id, "host": host}
+        # At this point, we *know* this is a dataverse URL, because:
+        # 1. The DOI resolved to a particular host (if using DOI)
+        # 2. The host is in the list of known dataverse installations
+        #
+        # We don't know exactly what kind of dataverse object this is, but
+        # that can be figured out during fetch as needed
+        return url
+
+    def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str:
+        """
+        Return the persistent_id (DOI) of the dataset that a given file_id (int or doi) belongs to
+        """
+        if file_id.isdigit():
+            # the file_id is an integer, rather than a persistent id (DOI)
+            api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true"
+        else:
+            # the file_id is a doi itself
+            api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true"
+
+        resp = self._request(api_url)
+        if resp.status_code == 404:
+            raise ValueError(f"File with id {file_id} not found in {base_url}")
+
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+        return data["datasetVersion"]["datasetPersistentId"]
+
+    def parse_dataverse_url(self, url: str) -> Tuple[str, bool]:
+        """
+        Parse the persistent id out of a dataverse URL
+
+        persistent_id can point to either a dataset or a file. The second return
+        value is True if we can not tell whether the persistent id refers to a
+        dataset or a file, and False if it is unambiguous.
+
+        Raises a ValueError if we can not parse the url
+        """
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        qs = parse_qs(parsed_url.query)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        is_ambiguous = False
+        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        if path.startswith("/citation"):
+            is_ambiguous = True
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+        elif path.startswith("/dataset.xhtml"):
+            persistent_id = qs["persistentId"][0]
+        # https://dataverse.harvard.edu/api/access/datafile/3323458
+        elif path.startswith("/api/access/datafile"):
+            # What we have here is an entity id, which we can use to get a persistentId
+            file_id = os.path.basename(path)
+            persistent_id = self.get_dataset_id_from_file_id(base_url, file_id)
+        elif parsed_url.path.startswith("/file.xhtml"):
+            file_persistent_id = qs["persistentId"][0]
+            persistent_id = self.get_dataset_id_from_file_id(
+                base_url, file_persistent_id
+            )
+        else:
+            raise ValueError(
+                f"Could not determine persistent id for dataverse URL {url}"
+            )
+
+        return persistent_id, is_ambiguous
+
+    def get_datafiles(self, url: str) -> List[dict]:
+        """
+        Return a list of dataFiles for the given persistent_id
+
+        Supports the following *dataset* URL styles:
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
+        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
+
+        Supports the following *file* URL styles (the entire dataset the file belongs to will be fetched):
+        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458
+        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        If a URL can not be parsed, throw an exception
+        """
+
+        parsed_url = urlparse(url)
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
+
+        persistent_id, is_ambiguous = self.parse_dataverse_url(url)
+
+        dataset_api_url = (
+            f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+        )
+        resp = self._request(dataset_api_url, headers={"accept": "application/json"})
+        if resp.status_code == 404 and is_ambiguous:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            persistent_id = self.get_dataset_id_from_file_id(base_url, persistent_id)
+            dataset_api_url = (
+                f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}"
+            )
+            resp = self._request(
+                dataset_api_url, headers={"accept": "application/json"}
+            )
+
+        if resp.status_code == 404:
+            # This persistent id is just not here
+            raise ValueError(f"{persistent_id} on {base_url} is not found")
+
+        # We already handled 404, raise error for everything else
+        resp.raise_for_status()
+
+        # We now know the exact persistent_id of the dataset we fetched
+        # Save it for use as content_id
+        self.persistent_id = persistent_id
+
+        data = resp.json()["data"]
+
+        return data["latestVersion"]["files"]

     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
-        record_id = spec["record"]
-        host = spec["host"]
+        url = spec
+        parsed_url = urlparse(url)
+        # FIXME: Support determining API URL better
+        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

-        yield f"Fetching Dataverse record {record_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        yield f"Fetching Dataverse record {url}.\n"

-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        record = resp.json()["data"]
-
-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(url):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
-                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
+                f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
             )
             filename = fobj["label"]
             original_filename = fobj["dataFile"].get("originalFileName", None)
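The net effect of the new helpers is that any supported URL is reduced to a dataset persistent_id via at most two Dataverse native API calls, then to a file listing. A rough standalone sketch of the same resolution chain, using requests directly (the example file id 3323458 is the one from the tests; the API routes are the ones used in the code above):

    import requests

    base_url = "https://dataverse.harvard.edu"

    # 1. A raw file id (e.g. from /api/access/datafile/3323458) is mapped to
    #    its parent dataset via the files API.
    resp = requests.get(f"{base_url}/api/files/3323458?returnDatasetVersion=true")
    dataset_pid = resp.json()["data"]["datasetVersion"]["datasetPersistentId"]

    # 2. The dataset's file listing is fetched with the datasets API.
    resp = requests.get(
        f"{base_url}/api/datasets/:persistentId",
        params={"persistentId": dataset_pid},
        headers={"accept": "application/json"},
    )
    files = resp.json()["data"]["latestVersion"]["files"]

    # 3. Each file is downloadable at /api/access/datafile/{id}?format=original,
    #    which is exactly what fetch() iterates over.
    for fobj in files:
        print(fobj["label"], fobj["dataFile"]["id"])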
@@ -128,5 +213,9 @@ class Dataverse(DoiProvider):

     @property
     def content_id(self):
-        """The Dataverse persistent identifier."""
-        return self.record_id
+        """
+        The Dataverse persistent identifier.
+
+        Only valid if called after a successful fetch
+        """
+        return self.persistent_id
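Putting the pieces together, a minimal end-to-end use of the provider after this change looks like the sketch below (illustrative only; it needs network access to dataverse.harvard.edu, and the dataset DOI is the one exercised by the tests):

    import tempfile

    from repo2docker.contentproviders import Dataverse

    dv = Dataverse()
    spec = dv.detect("doi:10.7910/DVN/TJCLKP")  # spec is now a plain URL, not a dict

    with tempfile.TemporaryDirectory() as target:
        for message in dv.fetch(spec, target):
            print(message, end="")

    # content_id is only meaningful after a successful fetch, since fetch()
    # is what records the exact dataset persistent_id.
    print(dv.content_id)  # e.g. "doi:10.7910/DVN/TJCLKP"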
repo2docker/contentproviders/doi.py

@@ -46,21 +46,28 @@ class DoiProvider(ContentProvider):
         # Transform a DOI to a URL
         # If not a doi, assume we have a URL and return
         if is_doi(doi):
-            doi = normalize_doi(doi)
+            normalized_doi = normalize_doi(doi)

-            try:
-                resp = self._request(f"https://doi.org/{doi}")
-                resp.raise_for_status()
-            except HTTPError as e:
-                # If the DOI doesn't exist, just return URL
-                if e.response.status_code == 404:
-                    return doi
-                # Reraise any other errors because if the DOI service is down (or
-                # we hit a rate limit) we don't want to silently continue to the
-                # default Git provider as this leads to a misleading error.
-                self.log.error(f"DOI {doi} does not resolve: {e}")
-                raise
-            return resp.url
+            # Use the doi.org resolver API
+            # documented at https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation#5-proxy-server-rest-api
+            req_url = f"https://doi.org/api/handles/{normalized_doi}"
+            resp = self._request(req_url)
+            if resp.status_code == 404:
+                # Not a doi, return what we were passed in
+                return doi
+            elif resp.status_code == 200:
+                data = resp.json()
+                # Pick the first URL we find from the doi response
+                for v in data["values"]:
+                    if v["type"] == "URL":
+                        return v["data"]["value"]
+
+                # No URLs found for this doi, what do we do?
+                self.log.error(f"DOI {normalized_doi} doesn't point to any URLs")
+                return doi
+            else:
+                # If we get any other status codes, raise error
+                resp.raise_for_status()
         else:
-            # Just return what is actulally just a URL
+            # Just return what is actually just a URL
             return doi
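The doi.org proxy-server REST API returns a JSON document describing the handle instead of issuing a 302 redirect, so doi2url can now pick the registered URL without following redirects (which is also why the zenodo expectation in test_doi.py below changes from /records/ back to the registered /record/ form). A sketch of what the resolver returns, trimmed to the fields the code reads:

    import requests

    resp = requests.get("https://doi.org/api/handles/10.5281/zenodo.3242074")
    data = resp.json()
    # data looks roughly like:
    # {
    #   "responseCode": 1,
    #   "handle": "10.5281/zenodo.3242074",
    #   "values": [
    #     {"index": 1, "type": "URL",
    #      "data": {"format": "string", "value": "https://zenodo.org/record/3242074"}},
    #     ...
    #   ]
    # }
    url = next(v["data"]["value"] for v in data["values"] if v["type"] == "URL")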
tests/contentproviders/test_dataverse.py (new file)

@@ -0,0 +1,143 @@
+import hashlib
+import os
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from repo2docker.contentproviders import Dataverse
+
+
+@pytest.mark.parametrize(
+    ("doi", "resolved"),
+    [
+        (
+            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+        ),
+        (
+            "10.7910/DVN/6ZXAGT/3YRRYJ",
+            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+        ),
+        (
+            "10.7910/DVN/TJCLKP",
+            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+        ),
+        (
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+        ),
+        (
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+        ),
+        ("/some/random/string", None),
+        ("https://example.com/path/here", None),
+        # Non dataverse DOIs
+        ("https://doi.org/10.21105/joss.01277", None),
+    ],
+)
+def test_detect(doi, resolved):
+    assert Dataverse().detect(doi) == resolved
+
+
+@pytest.mark.parametrize(
+    ("url", "persistent_id", "is_ambiguous"),
+    [
+        (
+            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            "doi:10.7910/DVN/6ZXAGT",
+            False,
+        ),
+        (
+            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            "doi:10.7910/DVN/TJCLKP",
+            True,
+        ),
+        (
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            "doi:10.7910/DVN/3MJ7IR",
+            False,
+        ),
+        (
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            "hdl:11529/10016",
+            False,
+        ),
+    ],
+)
+def test_get_persistent_id(url, persistent_id, is_ambiguous):
+    assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous)
+
+
+@pytest.mark.parametrize(
+    ("specs", "md5tree"),
+    [
+        (
+            (
+                "doi:10.7910/DVN/TJCLKP",
+                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            ),
+            {
+                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
+                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
+                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
+            },
+        ),
+        (
+            (
+                "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+                "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+                "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            ),
+            {
+                "ArchaeoGLOBE-master/analysis/figures/1_response_distribution.png": "243c6a3dd66bc3c84102829b277ef333",
+                "ArchaeoGLOBE-master/analysis/figures/2_trends_map_knowledge.png": "2ace6ae9d470dda6cf2f9f9a6588171a",
+                "ArchaeoGLOBE-master/analysis/figures/3_trends_global.png": "63ccd0a7b2d20440cd8f418d4ee88c4d",
+                "ArchaeoGLOBE-master/analysis/figures/4_consensus_transitions.png": "facfaedabeac77c4496d4b9e962a917f",
+                "ArchaeoGLOBE-master/analysis/figures/5_ArchaeoGLOBE_HYDE_comparison.png": "8e002e4d50f179fc1808f562b1353588",
+                "ArchaeoGLOBE-master/apt.txt": "b4224032da6c71d48f46c9b78fc6ed77",
+                "ArchaeoGLOBE-master/analysis/archaeoglobe.pdf": "f575be4790efc963ef1bd40d097cc06d",
+                "ArchaeoGLOBE-master/analysis/archaeoglobe.Rmd": "f37d5f7993fde9ebd64d16b20fc22905",
+                "ArchaeoGLOBE-master/ArchaeoGLOBE.Rproj": "d0250e7918993bab1e707358fe5633e0",
+                "ArchaeoGLOBE-master/CONDUCT.md": "f87ef290340322089c32b4e573d8f1e8",
+                "ArchaeoGLOBE-master/.circleci/config.yml": "6eaa54073a682b3195d8fab3a9dd8344",
+                "ArchaeoGLOBE-master/CONTRIBUTING.md": "b3a6abfc749dd155a3049f94a855bf9f",
+                "ArchaeoGLOBE-master/DESCRIPTION": "745ef979494999e483987de72c0adfbd",
+                "ArchaeoGLOBE-master/dockerfile": "aedce68e5a7d6e79cbb24c9cffeae593",
+                "ArchaeoGLOBE-master/.binder/Dockerfile": "7564a41246ba99b60144afb1d3b6d7de",
+                "ArchaeoGLOBE-master/.gitignore": "62c1482e4febbd35dc02fb7e2a31246b",
+                "ArchaeoGLOBE-master/analysis/data/derived-data/hyde_crop_prop.RDS": "2aea7748b5586923b0de9d13af58e59d",
+                "ArchaeoGLOBE-master/analysis/data/derived-data/kk_anthro_prop.RDS": "145a9e5dd2c95625626a720b52178b70",
+                "ArchaeoGLOBE-master/LICENSE.md": "3aa9d41a92a57944bd4590e004898445",
+                "ArchaeoGLOBE-master/analysis/data/derived-data/placeholder": "d41d8cd98f00b204e9800998ecf8427e",
+                "ArchaeoGLOBE-master/.Rbuildignore": "df15e4fed49abd685b536fef4472b01f",
+                "ArchaeoGLOBE-master/README.md": "0b0faabe580c4d76a0e0d64a4f54bca4",
+                "ArchaeoGLOBE-master/analysis/data/derived-data/README.md": "547fd1a6e874f6178b1cf525b5b9ae72",
+                "ArchaeoGLOBE-master/analysis/figures/S1_FHG_consensus.png": "d2584352e5442b33e4b23e361ca70fe1",
+                "ArchaeoGLOBE-master/analysis/figures/S2_EXAG_consensus.png": "513eddfdad01fd01a20263a55ca6dbe3",
+                "ArchaeoGLOBE-master/analysis/figures/S3_INAG_consensus.png": "b16ba0ecd21b326f873209a7e55a8deb",
+                "ArchaeoGLOBE-master/analysis/figures/S4_PAS_consensus.png": "05695f9412337a00c1cb6d1757d0ec5c",
+                "ArchaeoGLOBE-master/analysis/figures/S5_URBAN_consensus.png": "10119f7495d3b8e7ad7f8a0770574f15",
+                "ArchaeoGLOBE-master/analysis/figures/S6_trends_map_landuse.png": "b1db7c97f39ccfc3a9e094c3e6307af0",
+                "ArchaeoGLOBE-master/analysis/figures/S7_ArchaeoGLOBE_KK10_comparison.png": "30341748324f5f66acadb34c114c3e9d",
+            },
+        ),
+    ],
+)
+def test_fetch(specs: list[str], md5tree):
+    dv = Dataverse()
+
+    for spec in specs:
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(dv.detect(spec), d):
+                output.append(l)
+
+            # Verify md5 sum of the files we expect to find
+            # We are using md5 instead of something more secure because that is what
+            # dataverse itself uses
+            for subpath, expected_sha in md5tree.items():
+                with open(os.path.join(d, subpath), "rb") as f:
+                    h = hashlib.md5()
+                    h.update(f.read())
+                    assert h.hexdigest() == expected_sha
@ -1,160 +0,0 @@
|
|||
import json
|
||||
import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
from tempfile import TemporaryDirectory
|
||||
from unittest.mock import patch
|
||||
from urllib.parse import urlsplit
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
import pytest
|
||||
|
||||
from repo2docker.contentproviders import Dataverse
|
||||
|
||||
test_dv = Dataverse()
|
||||
harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
|
||||
cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
|
||||
test_hosts = [
|
||||
(
|
||||
[
|
||||
"doi:10.7910/DVN/6ZXAGT/3YRRYJ",
|
||||
"10.7910/DVN/6ZXAGT",
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458",
|
||||
"https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
|
||||
],
|
||||
[
|
||||
{"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
|
||||
{"host": cimmyt_dv, "record": "hdl:11529/10016"},
|
||||
],
|
||||
)
|
||||
]
|
||||
doi_responses = {
|
||||
"https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
|
||||
"https://dataverse.harvard.edu/file.xhtml"
|
||||
"?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
|
||||
),
|
||||
"https://doi.org/10.7910/DVN/6ZXAGT": (
|
||||
"https://dataverse.harvard.edu/dataset.xhtml"
|
||||
"?persistentId=doi:10.7910/DVN/6ZXAGT"
|
||||
),
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458": (
|
||||
"https://dataverse.harvard.edu/api/access/datafile/3323458"
|
||||
),
|
||||
"https://doi.org/10.21105/joss.01277": (
|
||||
"https://joss.theoj.org/papers/10.21105/joss.01277"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("test_input, expected", test_hosts)
|
||||
def test_detect_dataverse(test_input, expected, requests_mock):
|
||||
def doi_resolver(req, context):
|
||||
resp = doi_responses.get(req.url)
|
||||
# doi responses are redirects
|
||||
if resp is not None:
|
||||
context.status_code = 302
|
||||
context.headers["Location"] = resp
|
||||
return resp
|
||||
|
||||
requests_mock.get(re.compile("https://"), json=doi_resolver)
|
||||
requests_mock.get(
|
||||
"https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
|
||||
json={
|
||||
"data": {
|
||||
"count_in_response": 1,
|
||||
"items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
assert requests_mock.call_count == 0
|
||||
# valid Dataverse DOIs trigger this content provider
|
||||
assert Dataverse().detect(test_input[0]) == expected[0]
|
||||
# 4: doi resolution (302), File, doi resolution (302), then dataset
|
||||
assert requests_mock.call_count == 4
|
||||
requests_mock.reset_mock()
|
||||
|
||||
assert Dataverse().detect(test_input[1]) == expected[0]
|
||||
# 2: doi (302), dataset
|
||||
assert requests_mock.call_count == 2
|
||||
requests_mock.reset_mock()
|
||||
|
||||
assert Dataverse().detect(test_input[2]) == expected[0]
|
||||
# 1: datafile (search dataverse for the file id)
|
||||
assert requests_mock.call_count == 1
|
||||
requests_mock.reset_mock()
|
||||
|
||||
assert Dataverse().detect(test_input[3]) == expected[1]
|
||||
requests_mock.reset_mock()
|
||||
|
||||
# Don't trigger the Dataverse content provider
|
||||
assert Dataverse().detect("/some/path/here") is None
|
||||
assert Dataverse().detect("https://example.com/path/here") is None
|
||||
# don't handle DOIs that aren't from Dataverse
|
||||
assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dv_files(tmpdir):
|
||||
f1 = tmpdir.join("some-file.txt")
|
||||
f1.write("some content")
|
||||
|
||||
f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
|
||||
f2.write("some other content")
|
||||
|
||||
f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
|
||||
f3.write("yet another content")
|
||||
|
||||
return [f1, f2, f3]
|
||||
|
||||
|
||||
def test_dataverse_fetch(dv_files, requests_mock):
|
||||
mock_response = {
|
||||
"data": {
|
||||
"latestVersion": {
|
||||
"files": [
|
||||
{"dataFile": {"id": 1}, "label": "some-file.txt"},
|
||||
{
|
||||
"dataFile": {"id": 2},
|
||||
"label": "some-other-file.txt",
|
||||
"directoryLabel": "directory",
|
||||
},
|
||||
{
|
||||
"dataFile": {"id": 3},
|
||||
"label": "the-other-file.txt",
|
||||
"directoryLabel": "directory/subdirectory",
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}
|
||||
|
||||
def mock_filecontent(req, context):
|
||||
parts = urlsplit(req.url)
|
||||
file_no = int(parts.path.split("/")[-1]) - 1
|
||||
return open(dv_files[file_no], "rb").read()
|
||||
|
||||
requests_mock.get(
|
||||
"https://dataverse.harvard.edu/api/datasets/"
|
||||
":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
|
||||
json=mock_response,
|
||||
)
|
||||
requests_mock.get(
|
||||
re.compile("https://dataverse.harvard.edu/api/access/datafile"),
|
||||
content=mock_filecontent,
|
||||
)
|
||||
|
||||
dv = Dataverse()
|
||||
|
||||
with TemporaryDirectory() as d:
|
||||
output = []
|
||||
for l in dv.fetch(spec, d):
|
||||
output.append(l)
|
||||
unpacked_files = set(os.listdir(d))
|
||||
expected = {"directory", "some-file.txt"}
|
||||
assert expected == unpacked_files
|
||||
assert os.path.isfile(
|
||||
os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
|
||||
)
|
|
test_doi.py

@@ -33,7 +33,7 @@ def test_url_headers(requests_mock):
 @pytest.mark.parametrize(
     "requested_doi, expected",
     [
-        ("10.5281/zenodo.3242074", "https://zenodo.org/records/3242074"),
+        ("10.5281/zenodo.3242074", "https://zenodo.org/record/3242074"),
         # Unresolving DOI:
         ("10.1/1234", "10.1/1234"),
     ],
test_figshare.py

@@ -21,16 +21,9 @@ test_content_ids = [


 @pytest.mark.parametrize("link,expected", test_content_ids)
-def test_content_id(link, expected, requests_mock):
-    def mocked_get(req, context):
-        if req.url.startswith("https://doi.org"):
-            context.status_code = 302
-            context.headers["Location"] = link
-        return link
-
-    requests_mock.get(re.compile("https://"), text=mocked_get)
+def test_content_id(link, expected):
     fig = Figshare()
-    fig.detect("10.6084/m9.figshare.9782777")
+    fig.detect(link)
     assert fig.content_id == expected
test_hydroshare.py

@@ -34,20 +34,14 @@ hydroshare_data = {
 }


-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_content_id():
     hydro = Hydroshare()

     hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
-    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
+    assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1585005408"


-def test_detect_hydroshare(requests_mock):
-    requests_mock.get(re.compile("https://"), json=hydroshare_data)
-    requests_mock.get(re.compile("https://doi.org"), json=doi_resolver)
-
+def test_detect_hydroshare():
     # valid Hydroshare DOIs trigger this content provider
     expected = {
         "host": {
@@ -59,7 +53,7 @@ def test_detect_hydroshare(requests_mock):
             "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
         },
         "resource": "b8f6eae9d89241cf8b5904033460af61",
-        "version": "1569427757",
+        "version": "1585005408",
     }

     assert (
@@ -68,16 +62,10 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert a call to urlopen was called to fetch version
-    assert requests_mock.call_count == 1
-    requests_mock.reset_mock()

     assert (
         Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected
     )
-    # assert 3 calls were made, 2 to resolve the DOI (302 + 200) and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()

     assert (
         Hydroshare().detect(
@@ -85,9 +73,6 @@ def test_detect_hydroshare(requests_mock):
         )
         == expected
     )
-    # assert 3 more calls were made, 2 to resolve the DOI and another to fetch the version
-    assert requests_mock.call_count == 3
-    requests_mock.reset_mock()

     # Don't trigger the Hydroshare content provider
     assert Hydroshare().detect("/some/path/here") is None
test_zenodo.py

@@ -21,18 +21,7 @@ doi_responses = {
 }


-def doi_resolver(req, context):
-    resp = doi_responses.get(req.url)
-    # doi responses are redirects
-    if resp is not None:
-        context.status_code = 302
-        context.headers["Location"] = resp
-    return resp
-
-
-def test_content_id(requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
-
+def test_content_id():
     zen = Zenodo()
     zen.detect("10.5281/zenodo.3232985")
     assert zen.content_id == "3232985"
@@ -60,15 +49,11 @@ test_hosts = [


 @pytest.mark.parametrize("test_input,expected", test_hosts)
-def test_detect_zenodo(test_input, expected, requests_mock):
-    requests_mock.get(re.compile("https://"), json=doi_resolver)
+def test_detect_zenodo(test_input, expected):
     # valid Zenodo DOIs trigger this content provider
     assert Zenodo().detect(test_input[0]) == expected
     assert Zenodo().detect(test_input[1]) == expected
     assert Zenodo().detect(test_input[2]) == expected
-    # only two of the three calls above have to resolve a DOI (2 req per doi resolution)
-    assert requests_mock.call_count == 4
-    requests_mock.reset_mock()

     # Don't trigger the Zenodo content provider
     assert Zenodo().detect("/some/path/here") is None