Full generalization and support for CaltechDATA

pull/704/head
Tom Morrell 2019-06-18 15:11:06 -07:00
rodzic d2016bb722
commit 87f33e3b6a
4 zmienionych plików z 138 dodań i 20 usunięć

Wyświetl plik

@ -1,15 +1,16 @@
import os import os
import json import json
import shutil import shutil
import copy
from os import makedirs from os import makedirs
from os import path from os import path
from urllib.request import build_opener, urlopen, Request from urllib.request import build_opener, urlopen, Request
from zipfile import ZipFile, is_zipfile from zipfile import ZipFile, is_zipfile
from idutils import normalize_doi, is_doi
from .base import ContentProvider from .base import ContentProvider
from ..utils import copytree from ..utils import copytree
from ..utils import normalize_doi, is_doi
from .. import __version__ from .. import __version__
@ -40,6 +41,24 @@ class Zenodo(ContentProvider):
else: else:
return doi return doi
def _getfromdict(self, datadict, dotpath):
    """Return the value found in ``datadict`` at the dot-separated ``dotpath``.

    Each period-separated segment of ``dotpath`` is applied in turn as a
    subscript, so ``"data.files.0"`` returns
    ``datadict["data"]["files"][0]``. Segments made only of digits are
    used as integer indexes (for list access), unless the container being
    subscripted is a dict that has that exact string as a key.

    The returned value is a deep copy, so callers may modify it without
    changing ``datadict``.

    Raises ``KeyError``, ``IndexError`` or ``TypeError`` (from the
    underlying subscript) when the path cannot be followed.
    """
    value = datadict
    for segment in dotpath.split("."):
        key = segment
        # Digits normally mean "list index"; keep the string form only when
        # a dict really has that string as one of its keys.
        has_string_key = isinstance(value, dict) and segment in value
        if segment.isdigit() and not has_string_key:
            key = int(segment)
        value = value[key]
    # Copy only the value that was looked up rather than the whole structure.
    return copy.deepcopy(value)
def detect(self, doi, ref=None, extra_args=None): def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Zenodo/Invenio record""" """Trigger this provider for things that resolve to a Zenodo/Invenio record"""
# We need the hostname (url where records are), api url (for metadata), # We need the hostname (url where records are), api url (for metadata),
@ -51,7 +70,7 @@ class Zenodo(ContentProvider):
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/", "api": "https://zenodo.org/api/records/",
"filepath": "files", "filepath": "files",
"filename": "files.key", "filename": "filename",
"download": "links.download", "download": "links.download",
"type": "metadata.upload_type", "type": "metadata.upload_type",
}, },
@ -61,8 +80,9 @@ class Zenodo(ContentProvider):
"http://data.caltech.edu/records/", "http://data.caltech.edu/records/",
], ],
"api": "https://data.caltech.edu/api/record/", "api": "https://data.caltech.edu/api/record/",
"filepath": "files", "filepath": "metadata.electronic_location_and_access",
"filename": "electronic_location_and_access.electronic_name.0", "filename": "electronic_name.0",
"download": "uniform_resource_identifier",
"type": "metadata.resourceType.resourceTypeGeneral", "type": "metadata.resourceType.resourceTypeGeneral",
}, },
] ]
@ -91,8 +111,8 @@ class Zenodo(ContentProvider):
def _fetch(file_ref, unzip=False): def _fetch(file_ref, unzip=False):
# the assumption is that `unzip=True` means that this is the only # the assumption is that `unzip=True` means that this is the only
# file related to the zenodo record # file related to the zenodo record
with self._urlopen(file_ref["links"]["download"]) as src: with self._urlopen(self._getfromdict(file_ref, host["download"])) as src:
fname = file_ref["filename"] fname = self._getfromdict(file_ref, host["filename"])
if path.dirname(fname): if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname)) sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir): if not path.exists(sub_dir):
@ -126,9 +146,13 @@ class Zenodo(ContentProvider):
copytree(path.join(output_dir, d), output_dir) copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d)) shutil.rmtree(path.join(output_dir, d))
is_software = record["metadata"]["upload_type"] == "software" is_software = self._getfromdict(record, host["type"]).lower() == "software"
only_one_file = len(record["files"]) == 1 files = self._getfromdict(record, host["filepath"])
for file_ref in record["files"]:
#
only_one_file = len(files) == 1
for file_ref in files:
for line in _fetch(file_ref, unzip=is_software and only_one_file): for line in _fetch(file_ref, unzip=is_software and only_one_file):
yield line yield line

Wyświetl plik

@ -391,3 +391,24 @@ def copytree(
if errors: if errors:
raise Error(errors) raise Error(errors)
return dst return dst
# Code segments below from idutils (https://github.com/inveniosoftware/idutils)
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2018 Alan Rubin.
# Licensed under BSD-3-Clause license
# Compiled, case-insensitive pattern for DOIs.
# Group 1 is the optional prefix ("doi:" or a doi.org / dx.doi.org URL);
# group 2 is the DOI itself, starting at "10.".
# A raw string is used so that \s, \. and \d reach the regex engine
# without Python treating them as (invalid) string escapes.
doi_regexp = re.compile(
    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I
)
"""See http://en.wikipedia.org/wiki/Digital_object_identifier."""
def is_doi(val):
    """Check whether ``val`` looks like a DOI.

    Returns the match object produced by ``doi_regexp`` (truthy) when the
    start of ``val`` matches, and ``None`` otherwise.
    """
    found = doi_regexp.match(val)
    return found
def normalize_doi(val):
    """Strip any ``doi:`` / doi.org prefix from ``val`` and return the DOI.

    The result is the second group of ``doi_regexp``, i.e. the part that
    starts at ``10.``. ``val`` must match the pattern; for a non-matching
    value the match is ``None`` and the attribute access raises
    ``AttributeError``.
    """
    return doi_regexp.match(val).group(2)

Wyświetl plik

@ -20,7 +20,6 @@ setup(
"ruamel.yaml>=0.15", "ruamel.yaml>=0.15",
"toml", "toml",
"semver", "semver",
"idutils",
], ],
python_requires=">=3.5", python_requires=">=3.5",
author="Project Jupyter Contributors", author="Project Jupyter Contributors",

Wyświetl plik

@ -24,13 +24,47 @@ def test_detect():
with patch.object(Zenodo, "_urlopen") as fake_urlopen: with patch.object(Zenodo, "_urlopen") as fake_urlopen:
fake_urlopen.return_value.url = "https://zenodo.org/record/3232985" fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
# valid Zenodo DOIs trigger this content provider # valid Zenodo DOIs trigger this content provider
assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"} assert Zenodo().detect("10.5281/zenodo.3232985") == {
assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == { "host": {
"record": "3232985" "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "3232985",
} }
assert Zenodo().detect("https://zenodo.org/record/3232985") == { assert (
"record": "3232985" Zenodo().detect("https://doi.org/10.5281/zenodo.3232985")["record"]
== "3232985"
)
assert (
Zenodo().detect("https://zenodo.org/record/3232985")["record"] == "3232985"
)
# only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 2
with patch.object(Zenodo, "_urlopen") as fake_urlopen:
fake_urlopen.return_value.url = "https://data.caltech.edu/records/1235"
# valid CaltechDATA DOIs trigger this content provider
assert Zenodo().detect("10.22002/d1.1235") == {
"hots": {
"hostname": [
"https://data.caltech.edu/records/",
"http://data.caltech.edu/records/",
],
"api": "https://data.caltech.edu/api/record/",
"filepath": "metadata.electronic_location_and_access",
"filename": "electronic_name.0",
"download": "uniform_resource_identifier",
"type": "metadata.resourceType.resourceTypeGeneral",
},
"record": "1235",
} }
assert Zenodo().detect("https://doi.org/10.22002/d1.1235")["record"] == "1235"
assert Zenodo().detect("https://zenodo.org/record/3232985")["record"] == "1235"
# only two of the three calls above have to resolve a DOI # only two of the three calls above have to resolve a DOI
assert fake_urlopen.call_count == 2 assert fake_urlopen.call_count == 2
@ -83,10 +117,24 @@ def test_fetch_software_from_github_archive():
with patch.object(Zenodo, "_urlopen", new=mock_urlopen): with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
zen = Zenodo() zen = Zenodo()
spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
with TemporaryDirectory() as d: with TemporaryDirectory() as d:
output = [] output = []
for l in zen.fetch({"record": "1234"}, d): for l in zen.fetch(spec, d):
output.append(l) output.append(l)
unpacked_files = set(os.listdir(d)) unpacked_files = set(os.listdir(d))
@ -123,9 +171,22 @@ def test_fetch_software():
with patch.object(Zenodo, "_urlopen", new=mock_urlopen): with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
with TemporaryDirectory() as d: with TemporaryDirectory() as d:
zen = Zenodo() zen = Zenodo()
spec = spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
output = [] output = []
for l in zen.fetch({"record": "1234"}, d): for l in zen.fetch(spec, d):
output.append(l) output.append(l)
unpacked_files = set(os.listdir(d)) unpacked_files = set(os.listdir(d))
@ -164,9 +225,22 @@ def test_fetch_data():
with patch.object(Zenodo, "_urlopen", new=mock_urlopen): with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
with TemporaryDirectory() as d: with TemporaryDirectory() as d:
zen = Zenodo() zen = Zenodo()
spec = {
"host": {
"hostname": [
"https://zenodo.org/record/",
"http://zenodo.org/record/",
],
"api": "https://zenodo.org/api/records/",
"filepath": "files",
"filename": "filename",
"download": "links.download",
"type": "metadata.upload_type",
},
"record": "1234",
}
output = [] output = []
for l in zen.fetch({"record": "1234"}, d): for l in zen.fetch(spec, d):
output.append(l) output.append(l)
unpacked_files = set(os.listdir(d)) unpacked_files = set(os.listdir(d))