kopia lustrzana https://github.com/jupyterhub/repo2docker
[MRG] Generalize Zenodo content provider to support other Invenio repositories (#704)
[MRG] Generalize Zenodo content provider to support other Invenio repositories pull/712/head
commit
70819410f1
|
@ -5,10 +5,12 @@ import shutil
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from os import path
|
from os import path
|
||||||
from urllib.request import build_opener, urlopen, Request
|
from urllib.request import build_opener, urlopen, Request
|
||||||
|
from urllib.error import HTTPError
|
||||||
from zipfile import ZipFile, is_zipfile
|
from zipfile import ZipFile, is_zipfile
|
||||||
|
|
||||||
from .base import ContentProvider
|
from .base import ContentProvider
|
||||||
from ..utils import copytree
|
from ..utils import copytree, deep_get
|
||||||
|
from ..utils import normalize_doi, is_doi
|
||||||
from .. import __version__
|
from .. import __version__
|
||||||
|
|
||||||
|
|
||||||
|
@ -28,39 +30,64 @@ class Zenodo(ContentProvider):
|
||||||
|
|
||||||
return urlopen(req)
|
return urlopen(req)
|
||||||
|
|
||||||
|
def _doi2url(self, doi):
    """Turn a DOI into the URL it resolves to.

    `doi` may be a bare DOI, a ``doi:``-prefixed DOI or a doi.org URL.
    Anything that does not look like a DOI is assumed to already be a URL
    and is returned unchanged. A DOI that fails to resolve (the resolver
    answers with an HTTP error) is returned in its normalized form.
    """
    # Not a DOI: treat the input as a plain URL and hand it straight back
    if not is_doi(doi):
        return doi

    normalized = normalize_doi(doi)
    try:
        response = self._urlopen("https://doi.org/{}".format(normalized))
    except HTTPError:
        # The resolver rejected the DOI; let the caller decide what to do
        # with the normalized identifier
        return normalized
    # `url` is the final location after the resolver's redirects
    return response.url
|
||||||
|
|
||||||
def detect(self, doi, ref=None, extra_args=None):
    """Trigger this provider for things that resolve to a Zenodo/Invenio record.

    `doi` may be a DOI (in any form `_doi2url` understands) or a direct
    record URL on one of the known hosts.

    Returns a spec ``{"record": <record id>, "host": <host description>}``
    when the resolved URL belongs to a known host, otherwise ``None``.
    The record id is also stored on ``self.record_id``.
    """
    # For each supported repository we need the hostname (url where records
    # are), api url (for metadata), filepath (path to files in metadata),
    # filename (path to filename in metadata), download (path to file
    # download URL), and type (path to item type in metadata)
    hosts = [
        {
            "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
            "api": "https://zenodo.org/api/records/",
            "filepath": "files",
            "filename": "filename",
            "download": "links.download",
            "type": "metadata.upload_type",
        },
        {
            "hostname": [
                "https://data.caltech.edu/records/",
                "http://data.caltech.edu/records/",
            ],
            "api": "https://data.caltech.edu/api/record/",
            "filepath": "metadata.electronic_location_and_access",
            "filename": "electronic_name.0",
            "download": "uniform_resource_identifier",
            "type": "metadata.resourceType.resourceTypeGeneral",
        },
    ]

    url = self._doi2url(doi)

    for host in hosts:
        # str.startswith accepts a tuple of candidate prefixes
        if not url.startswith(tuple(host["hostname"])):
            continue
        # Drop trailing slashes first so ".../record/1234/" still yields
        # "1234" instead of an empty record id
        record_id = url.rstrip("/").rsplit("/", maxsplit=1)[1]
        if not record_id or not url.rstrip("/").startswith(tuple(host["hostname"])):
            # The URL is only the bare records prefix: there is no record
            continue
        self.record_id = record_id
        return {"record": self.record_id, "host": host}

    # Nothing matched: this provider does not handle the given input
    return None
|
|
||||||
|
|
||||||
def fetch(self, spec, output_dir, yield_output=False):
|
def fetch(self, spec, output_dir, yield_output=False):
|
||||||
"""Fetch and unpack a Zenodo record"""
|
"""Fetch and unpack a Zenodo record"""
|
||||||
record_id = spec["record"]
|
record_id = spec["record"]
|
||||||
|
host = spec["host"]
|
||||||
|
|
||||||
yield "Fetching Zenodo record {}.\n".format(record_id)
|
yield "Fetching Zenodo record {}.\n".format(record_id)
|
||||||
req = Request(
|
req = Request(
|
||||||
"https://zenodo.org/api/records/{}".format(record_id),
|
"{}{}".format(host["api"], record_id),
|
||||||
headers={"accept": "application/json"},
|
headers={"accept": "application/json"},
|
||||||
)
|
)
|
||||||
resp = self._urlopen(req)
|
resp = self._urlopen(req)
|
||||||
|
@ -70,8 +97,8 @@ class Zenodo(ContentProvider):
|
||||||
def _fetch(file_ref, unzip=False):
|
def _fetch(file_ref, unzip=False):
|
||||||
# the assumption is that `unzip=True` means that this is the only
|
# the assumption is that `unzip=True` means that this is the only
|
||||||
# file related to the zenodo record
|
# file related to the zenodo record
|
||||||
with self._urlopen(file_ref["links"]["download"]) as src:
|
with self._urlopen(deep_get(file_ref, host["download"])) as src:
|
||||||
fname = file_ref["filename"]
|
fname = deep_get(file_ref, host["filename"])
|
||||||
if path.dirname(fname):
|
if path.dirname(fname):
|
||||||
sub_dir = path.join(output_dir, path.dirname(fname))
|
sub_dir = path.join(output_dir, path.dirname(fname))
|
||||||
if not path.exists(sub_dir):
|
if not path.exists(sub_dir):
|
||||||
|
@ -105,9 +132,10 @@ class Zenodo(ContentProvider):
|
||||||
copytree(path.join(output_dir, d), output_dir)
|
copytree(path.join(output_dir, d), output_dir)
|
||||||
shutil.rmtree(path.join(output_dir, d))
|
shutil.rmtree(path.join(output_dir, d))
|
||||||
|
|
||||||
is_software = record["metadata"]["upload_type"] == "software"
|
is_software = deep_get(record, host["type"]).lower() == "software"
|
||||||
only_one_file = len(record["files"]) == 1
|
files = deep_get(record, host["filepath"])
|
||||||
for file_ref in record["files"]:
|
only_one_file = len(files) == 1
|
||||||
|
for file_ref in files:
|
||||||
for line in _fetch(file_ref, unzip=is_software and only_one_file):
|
for line in _fetch(file_ref, unzip=is_software and only_one_file):
|
||||||
yield line
|
yield line
|
||||||
|
|
||||||
|
|
|
@ -391,3 +391,45 @@ def copytree(
|
||||||
if errors:
|
if errors:
|
||||||
raise Error(errors)
|
raise Error(errors)
|
||||||
return dst
|
return dst
|
||||||
|
|
||||||
|
|
||||||
|
def deep_get(dikt, path):
    """Get a value located in `path` from a nested dictionary.

    Use a string separated by periods as the path to access
    values in a nested dictionary:

    deep_get(data, "data.files.0") == data["data"]["files"][0]

    A purely numeric component is used as an integer index for lists (and
    for dictionaries that really have that integer key). When the current
    value is a dictionary without such an integer key, the component is
    looked up as a string instead, so JSON objects with keys like "0" work.

    Raises KeyError / IndexError / TypeError from the underlying lookup
    when the path does not exist.
    """
    value = dikt
    for component in path.split("."):
        key = component
        # isdecimal() only accepts characters that int() can parse
        if component.isdecimal():
            index = int(component)
            # Keep the string key only for dicts that lack the integer key
            if not isinstance(value, dict) or index in value:
                key = index
        value = value[key]
    return value
|
||||||
|
|
||||||
|
|
||||||
|
# doi_regexp, is_doi, and normalize_doi are from idutils (https://github.com/inveniosoftware/idutils)
|
||||||
|
# Copyright (C) 2015-2018 CERN.
|
||||||
|
# Copyright (C) 2018 Alan Rubin.
|
||||||
|
# Licensed under BSD-3-Clause license
|
||||||
|
# Raw string so that \s, \d and \. reach the regex engine untouched instead
# of being treated as (invalid) Python string escapes.
# Group 1 is the optional "doi:" / doi.org URL prefix, group 2 the DOI itself.
doi_regexp = re.compile(
    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I
)


def is_doi(val):
    """Return a match object if `val` looks like a DOI, otherwise None.

    Accepts bare DOIs (10.1234/abc), "doi:"-prefixed DOIs and
    (dx.)doi.org URLs with or without a scheme.
    http://en.wikipedia.org/wiki/Digital_object_identifier.
    """
    return doi_regexp.match(val)


def normalize_doi(val):
    """Return just the DOI (e.g. 10.1234/jshd123)
    from a val that could include a url or doi
    (e.g. https://doi.org/10.1234/jshd123)

    Raises ValueError if `val` does not contain a DOI.
    """
    m = doi_regexp.match(val)
    if m is None:
        raise ValueError("{!r} is not a valid DOI".format(val))
    return m.group(2)
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
@ -20,18 +21,57 @@ def test_content_id():
|
||||||
assert zen.content_id == "3232985"
|
assert zen.content_id == "3232985"
|
||||||
|
|
||||||
|
|
||||||
def test_detect():
|
test_hosts = [
|
||||||
with patch.object(Zenodo, "_urlopen") as fake_urlopen:
|
(
|
||||||
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
|
[
|
||||||
# valid Zenodo DOIs trigger this content provider
|
"https://zenodo.org/record/3232985",
|
||||||
assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"}
|
"10.5281/zenodo.3232985",
|
||||||
assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == {
|
"https://doi.org/10.5281/zenodo.3232985",
|
||||||
"record": "3232985"
|
],
|
||||||
}
|
{
|
||||||
assert Zenodo().detect("https://zenodo.org/record/3232985") == {
|
"host": {
|
||||||
"record": "3232985"
|
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
|
||||||
}
|
"api": "https://zenodo.org/api/records/",
|
||||||
|
"filepath": "files",
|
||||||
|
"filename": "filename",
|
||||||
|
"download": "links.download",
|
||||||
|
"type": "metadata.upload_type",
|
||||||
|
},
|
||||||
|
"record": "3232985",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[
|
||||||
|
"https://data.caltech.edu/records/1235",
|
||||||
|
"10.22002/d1.1235",
|
||||||
|
"https://doi.org/10.22002/d1.1235",
|
||||||
|
],
|
||||||
|
{
|
||||||
|
"host": {
|
||||||
|
"hostname": [
|
||||||
|
"https://data.caltech.edu/records/",
|
||||||
|
"http://data.caltech.edu/records/",
|
||||||
|
],
|
||||||
|
"api": "https://data.caltech.edu/api/record/",
|
||||||
|
"filepath": "metadata.electronic_location_and_access",
|
||||||
|
"filename": "electronic_name.0",
|
||||||
|
"download": "uniform_resource_identifier",
|
||||||
|
"type": "metadata.resourceType.resourceTypeGeneral",
|
||||||
|
},
|
||||||
|
"record": "1235",
|
||||||
|
},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("test_input,expected", test_hosts)
|
||||||
|
def test_detect_zenodo(test_input, expected):
|
||||||
|
with patch.object(Zenodo, "_urlopen") as fake_urlopen:
|
||||||
|
fake_urlopen.return_value.url = test_input[0]
|
||||||
|
# valid Zenodo DOIs trigger this content provider
|
||||||
|
assert Zenodo().detect(test_input[0]) == expected
|
||||||
|
assert Zenodo().detect(test_input[1]) == expected
|
||||||
|
assert Zenodo().detect(test_input[2]) == expected
|
||||||
# only two of the three calls above have to resolve a DOI
|
# only two of the three calls above have to resolve a DOI
|
||||||
assert fake_urlopen.call_count == 2
|
assert fake_urlopen.call_count == 2
|
||||||
|
|
||||||
|
@ -39,13 +79,12 @@ def test_detect():
|
||||||
# Don't trigger the Zenodo content provider
|
# Don't trigger the Zenodo content provider
|
||||||
assert Zenodo().detect("/some/path/here") is None
|
assert Zenodo().detect("/some/path/here") is None
|
||||||
assert Zenodo().detect("https://example.com/path/here") is None
|
assert Zenodo().detect("https://example.com/path/here") is None
|
||||||
# donn't handle DOIs that aren't from Zenodo
|
# don't handle DOIs that aren't from Zenodo
|
||||||
|
fake_urlopen.return_value.url = (
|
||||||
|
"http://joss.theoj.org/papers/10.21105/joss.01277"
|
||||||
|
)
|
||||||
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
|
assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None
|
||||||
|
|
||||||
# none of the examples are Zenodo like, so we should not attempt to
|
|
||||||
# resolve a DOI either
|
|
||||||
assert not fake_urlopen.called
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def zenodo_archive(prefix="a_directory"):
|
def zenodo_archive(prefix="a_directory"):
|
||||||
|
@ -83,10 +122,24 @@ def test_fetch_software_from_github_archive():
|
||||||
|
|
||||||
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
|
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
|
||||||
zen = Zenodo()
|
zen = Zenodo()
|
||||||
|
spec = {
|
||||||
|
"host": {
|
||||||
|
"hostname": [
|
||||||
|
"https://zenodo.org/record/",
|
||||||
|
"http://zenodo.org/record/",
|
||||||
|
],
|
||||||
|
"api": "https://zenodo.org/api/records/",
|
||||||
|
"filepath": "files",
|
||||||
|
"filename": "filename",
|
||||||
|
"download": "links.download",
|
||||||
|
"type": "metadata.upload_type",
|
||||||
|
},
|
||||||
|
"record": "1234",
|
||||||
|
}
|
||||||
|
|
||||||
with TemporaryDirectory() as d:
|
with TemporaryDirectory() as d:
|
||||||
output = []
|
output = []
|
||||||
for l in zen.fetch({"record": "1234"}, d):
|
for l in zen.fetch(spec, d):
|
||||||
output.append(l)
|
output.append(l)
|
||||||
|
|
||||||
unpacked_files = set(os.listdir(d))
|
unpacked_files = set(os.listdir(d))
|
||||||
|
@ -123,9 +176,22 @@ def test_fetch_software():
|
||||||
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
|
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
|
||||||
with TemporaryDirectory() as d:
|
with TemporaryDirectory() as d:
|
||||||
zen = Zenodo()
|
zen = Zenodo()
|
||||||
|
spec = spec = {
|
||||||
|
"host": {
|
||||||
|
"hostname": [
|
||||||
|
"https://zenodo.org/record/",
|
||||||
|
"http://zenodo.org/record/",
|
||||||
|
],
|
||||||
|
"api": "https://zenodo.org/api/records/",
|
||||||
|
"filepath": "files",
|
||||||
|
"filename": "filename",
|
||||||
|
"download": "links.download",
|
||||||
|
"type": "metadata.upload_type",
|
||||||
|
},
|
||||||
|
"record": "1234",
|
||||||
|
}
|
||||||
output = []
|
output = []
|
||||||
for l in zen.fetch({"record": "1234"}, d):
|
for l in zen.fetch(spec, d):
|
||||||
output.append(l)
|
output.append(l)
|
||||||
|
|
||||||
unpacked_files = set(os.listdir(d))
|
unpacked_files = set(os.listdir(d))
|
||||||
|
@ -164,9 +230,22 @@ def test_fetch_data():
|
||||||
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
|
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
|
||||||
with TemporaryDirectory() as d:
|
with TemporaryDirectory() as d:
|
||||||
zen = Zenodo()
|
zen = Zenodo()
|
||||||
|
spec = {
|
||||||
|
"host": {
|
||||||
|
"hostname": [
|
||||||
|
"https://zenodo.org/record/",
|
||||||
|
"http://zenodo.org/record/",
|
||||||
|
],
|
||||||
|
"api": "https://zenodo.org/api/records/",
|
||||||
|
"filepath": "files",
|
||||||
|
"filename": "filename",
|
||||||
|
"download": "links.download",
|
||||||
|
"type": "metadata.upload_type",
|
||||||
|
},
|
||||||
|
"record": "1234",
|
||||||
|
}
|
||||||
output = []
|
output = []
|
||||||
for l in zen.fetch({"record": "1234"}, d):
|
for l in zen.fetch(spec, d):
|
||||||
output.append(l)
|
output.append(l)
|
||||||
|
|
||||||
unpacked_files = set(os.listdir(d))
|
unpacked_files = set(os.listdir(d))
|
||||||
|
|
|
@ -82,3 +82,31 @@ def test_invalid_port_mapping(port_spec):
|
||||||
utils.validate_and_generate_port_mapping([port_spec])
|
utils.validate_and_generate_port_mapping([port_spec])
|
||||||
|
|
||||||
assert 'Port specification "{}"'.format(port_spec) in str(e.value)
|
assert 'Port specification "{}"'.format(port_spec) in str(e.value)
|
||||||
|
|
||||||
|
|
||||||
|
def test_deep_get():
    """deep_get follows dotted paths through nested dicts and lists."""
    data = {"data": {"files": [1, 2, 3]}}
    # dotted path -> value expected at that location
    expectations = [
        ("data.files.0", 1),
        ("data.files.1", 2),
        ("data.files", [1, 2, 3]),
        ("data", {"files": [1, 2, 3]}),
    ]
    for dotted_path, expected in expectations:
        assert utils.deep_get(data, dotted_path) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_doi():
    """is_doi matches every supported DOI spelling and rejects non-DOIs."""
    # Identity comparison is the correct way to test against None
    assert utils.is_doi("10.1234/jshd123") is not None
    assert utils.is_doi("10.1234/JSHD.8192") is not None
    assert utils.is_doi("doi.org/10.1234/jshd123") is not None
    assert utils.is_doi("http://doi.org/10.1234/jshd123") is not None
    assert utils.is_doi("https://doi.org/10.1234/jshd123") is not None
    assert utils.is_doi("http://dx.doi.org/10.1234/jshd123") is not None
    # Missing the dot after the "10" directory indicator
    assert utils.is_doi("101234/jshd123") is None
    # A plain URL that is not a DOI resolver link
    assert utils.is_doi("https://mybinder.org") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_doi():
    """normalize_doi strips any resolver prefix and keeps the DOI's case."""
    # input spelling -> bare DOI expected back
    cases = [
        ("10.1234/jshd123", "10.1234/jshd123"),
        ("10.1234/JSHD.8192", "10.1234/JSHD.8192"),
        ("doi.org/10.1234/jshd123", "10.1234/jshd123"),
        ("http://doi.org/10.1234/jshd123", "10.1234/jshd123"),
        ("https://doi.org/10.1234/jshd123", "10.1234/jshd123"),
        ("http://dx.doi.org/10.1234/jshd123", "10.1234/jshd123"),
    ]
    for raw, bare in cases:
        assert utils.normalize_doi(raw) == bare
|
||||||
|
|
Ładowanie…
Reference in New Issue