Full generalization and support for CaltechDATA

pull/704/head
Tom Morrell 2019-06-18 15:11:06 -07:00
rodzic d2016bb722
commit 87f33e3b6a
4 zmienionych plików z 138 dodań i 20 usunięć

Wyświetl plik

@ -1,15 +1,16 @@
import os
import json
import shutil
import copy
from os import makedirs
from os import path
from urllib.request import build_opener, urlopen, Request
from zipfile import ZipFile, is_zipfile
from idutils import normalize_doi, is_doi
from .base import ContentProvider
from ..utils import copytree
from ..utils import normalize_doi, is_doi
from .. import __version__
@ -40,6 +41,24 @@ class Zenodo(ContentProvider):
else:
return doi
def _getfromdict(self, datadict, dotpath):
    """Return a copy of the value found at ``dotpath`` inside ``datadict``.

    ``dotpath`` is a string of keys separated by periods. Segments made up
    only of decimal digits are converted to ``int`` so they can index into
    lists; every other segment is used as-is as a key. For example,
    ``"data.files.0"`` returns ``datadict["data"]["files"][0]``.

    The lookup itself is plain subscripting, so a missing key or index
    propagates as the usual ``KeyError`` / ``IndexError`` / ``TypeError``.

    The returned value is a deep copy, so callers can modify it without
    touching ``datadict``.
    """
    value = datadict
    for segment in dotpath.split("."):
        # ``isdecimal`` accepts exactly the characters ``int`` can parse;
        # ``isdigit`` would also accept e.g. superscript digits and crash.
        key = int(segment) if segment.isdecimal() else segment
        value = value[key]
    # Copy only the selected value instead of the whole input structure:
    # same result for the caller, without duplicating the full record
    # on every lookup.
    return copy.deepcopy(value)
def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
# We need the hostname (url where records are), api url (for metadata),
@ -51,7 +70,7 @@ class Zenodo(ContentProvider):
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "files.key",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
@ -61,8 +80,9 @@ class Zenodo(ContentProvider):
"http://data.caltech.edu/records/",
],
"api": "https://data.caltech.edu/api/record/",
"filepath": "files",
"filename": "electronic_location_and_access.electronic_name.0",
"filepath": "metadata.electronic_location_and_access",
"filename": "electronic_name.0",
"download": "uniform_resource_identifier",
"type": "metadata.resourceType.resourceTypeGeneral",
},
]
@ -91,8 +111,8 @@ class Zenodo(ContentProvider):
def _fetch(file_ref, unzip=False):
# the assumption is that `unzip=True` means that this is the only
# file related to the zenodo record
with self._urlopen(file_ref["links"]["download"]) as src:
fname = file_ref["filename"]
with self._urlopen(self._getfromdict(file_ref, host["download"])) as src:
fname = self._getfromdict(file_ref, host["filename"])
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
@ -126,9 +146,13 @@ class Zenodo(ContentProvider):
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))
is_software = record["metadata"]["upload_type"] == "software"
only_one_file = len(record["files"]) == 1
for file_ref in record["files"]:
is_software = self._getfromdict(record, host["type"]).lower() == "software"
files = self._getfromdict(record, host["filepath"])
#
only_one_file = len(files) == 1
for file_ref in files:
for line in _fetch(file_ref, unzip=is_software and only_one_file):
yield line

Wyświetl plik

@ -391,3 +391,24 @@ def copytree(
if errors:
raise Error(errors)
return dst
# Code segments below from idutils (https://github.com/inveniosoftware/idutils)
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2018 Alan Rubin.
# Licensed under BSD-3-Clause license
# Matches a DOI, optionally preceded by "doi:" or a (dx.)doi.org URL prefix.
# Group 1 is the optional prefix, group 2 is the bare DOI ("10.<digits>/...").
# The pattern is a raw string so the regex escapes are not treated as
# (invalid) string escapes, and the sub-prefix separator is an escaped literal
# dot: an unescaped "." there accepts malformed prefixes and makes
# "\d+(.\d+)*" backtrack exponentially on long digit runs without a "/".
doi_regexp = re.compile(
    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(\.\d+)*/.+)$", flags=re.I
)
"""See http://en.wikipedia.org/wiki/Digital_object_identifier."""
def is_doi(val):
    """Check whether ``val`` matches the DOI pattern.

    Returns the match object from ``doi_regexp`` (truthy) when ``val``
    matches, otherwise ``None``.
    """
    matched = doi_regexp.match(val)
    return matched
def normalize_doi(val):
    """Return the bare DOI contained in ``val``, without any prefix.

    ``val`` must match ``doi_regexp``; for a non-matching value the
    match is ``None`` and the attribute access raises ``AttributeError``.
    """
    # Group 2 of the pattern is the "10.<digits>/..." part after the
    # optional "doi:" / doi.org prefix.
    return doi_regexp.match(val).group(2)

Wyświetl plik

@ -20,7 +20,6 @@ setup(
"ruamel.yaml>=0.15",
"toml",
"semver",
"idutils",
],
python_requires=">=3.5",
author="Project Jupyter Contributors",

Wyświetl plik

@ -24,13 +24,47 @@ def test_detect():
with patch.object(Zenodo, "_urlopen") as fake_urlopen:
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
# valid Zenodo DOIs trigger this content provider
assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"}
assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == {
"record": "3232985"
assert Zenodo().detect("10.5281/zenodo.3232985") == {
"host": {
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "3232985",
}
assert Zenodo().detect("https://zenodo.org/record/3232985") == {
"record": "3232985"
assert (
Zenodo().detect("https://doi.org/10.5281/zenodo.3232985")["record"]
== "3232985"
)
assert (
Zenodo().detect("https://zenodo.org/record/3232985")["record"] == "3232985"
)
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 2
with patch.object(Zenodo, "_urlopen") as fake_urlopen:
    fake_urlopen.return_value.url = "https://data.caltech.edu/records/1235"
    # valid CaltechDATA DOIs trigger this content provider
    assert Zenodo().detect("10.22002/d1.1235") == {
        # the key must be "host", matching the Zenodo expectation earlier
        # in this test
        "host": {
            "hostname": [
                "https://data.caltech.edu/records/",
                "http://data.caltech.edu/records/",
            ],
            "api": "https://data.caltech.edu/api/record/",
            "filepath": "metadata.electronic_location_and_access",
            "filename": "electronic_name.0",
            "download": "uniform_resource_identifier",
            "type": "metadata.resourceType.resourceTypeGeneral",
        },
        "record": "1235",
    }
    assert Zenodo().detect("https://doi.org/10.22002/d1.1235")["record"] == "1235"
    # a direct CaltechDATA record URL (not a Zenodo one) yields record "1235"
    assert (
        Zenodo().detect("https://data.caltech.edu/records/1235")["record"]
        == "1235"
    )
    # only two of the three calls above have to resolve a DOI
    assert fake_urlopen.call_count == 2
@ -83,10 +117,24 @@ def test_fetch_software_from_github_archive():
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
zen = Zenodo()
spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
with TemporaryDirectory() as d:
output = []
for l in zen.fetch({"record": "1234"}, d):
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
@ -123,9 +171,22 @@ def test_fetch_software():
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
zen = Zenodo()
# Spec handed to ``zen.fetch``: the Zenodo host description plus the
# record id. (The assignment target was previously duplicated.)
spec = {
    "host": {
        "hostname": [
            "https://zenodo.org/record/",
            "http://zenodo.org/record/",
        ],
        "api": "https://zenodo.org/api/records/",
        "filepath": "files",
        "filename": "filename",
        "download": "links.download",
        "type": "metadata.upload_type",
    },
    "record": "1234",
}
output = []
for l in zen.fetch({"record": "1234"}, d):
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))
@ -164,9 +225,22 @@ def test_fetch_data():
with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
with TemporaryDirectory() as d:
zen = Zenodo()
spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
output = []
for l in zen.fetch({"record": "1234"}, d):
for l in zen.fetch(spec, d):
output.append(l)
unpacked_files = set(os.listdir(d))