From 87f33e3b6ae229dbfcea8e09abca450776c506de Mon Sep 17 00:00:00 2001 From: Tom Morrell Date: Tue, 18 Jun 2019 15:11:06 -0700 Subject: [PATCH] Full generalization and support for CaltechDATA --- repo2docker/contentproviders/zenodo.py | 42 +++++++--- repo2docker/utils.py | 21 +++++ setup.py | 1 - tests/unit/contentproviders/test_zenodo.py | 94 +++++++++++++++++++--- 4 files changed, 138 insertions(+), 20 deletions(-) diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 8d9ec367..03bc076b 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -1,15 +1,16 @@ import os import json import shutil +import copy from os import makedirs from os import path from urllib.request import build_opener, urlopen, Request from zipfile import ZipFile, is_zipfile -from idutils import normalize_doi, is_doi from .base import ContentProvider from ..utils import copytree +from ..utils import normalize_doi, is_doi from .. 
import __version__ @@ -40,6 +41,24 @@ class Zenodo(ContentProvider): else: return doi + def _getfromdict(self, datadict, dotpath): + # Use a dotpath (string of keys separated by periods) + # to access values in a nested dictionary/list, e.g. + # "data.files.0" returns datadict["data"]["files"][0] + split = dotpath.split(".") + # Purely numeric path components are converted to + # ints so they can be used for list access + mapList = [] + for s in split: + if s.isdigit(): + mapList.append(int(s)) + else: + mapList.append(s) + values = copy.deepcopy(datadict) + for k in mapList: + values = values[k] + return values + def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" # We need the hostname (url where records are), api url (for metadata), @@ -51,7 +70,7 @@ class Zenodo(ContentProvider): "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], "api": "https://zenodo.org/api/records/", "filepath": "files", - "filename": "files.key", + "filename": "filename", "download": "links.download", "type": "metadata.upload_type", }, @@ -61,8 +80,9 @@ class Zenodo(ContentProvider): "http://data.caltech.edu/records/", ], "api": "https://data.caltech.edu/api/record/", - "filepath": "files", - "filename": "electronic_location_and_access.electronic_name.0", + "filepath": "metadata.electronic_location_and_access", + "filename": "electronic_name.0", + "download": "uniform_resource_identifier", "type": "metadata.resourceType.resourceTypeGeneral", }, ] @@ -91,8 +111,8 @@ class Zenodo(ContentProvider): def _fetch(file_ref, unzip=False): # the assumption is that `unzip=True` means that this is the only # file related to the zenodo record - with self._urlopen(file_ref["links"]["download"]) as src: - fname = file_ref["filename"] + with self._urlopen(self._getfromdict(file_ref, host["download"])) as src: + fname = self._getfromdict(file_ref, host["filename"]) if path.dirname(fname): sub_dir = path.join(output_dir, path.dirname(fname)) if not 
path.exists(sub_dir): @@ -126,9 +146,13 @@ class Zenodo(ContentProvider): copytree(path.join(output_dir, d), output_dir) shutil.rmtree(path.join(output_dir, d)) - is_software = record["metadata"]["upload_type"] == "software" - only_one_file = len(record["files"]) == 1 - for file_ref in record["files"]: + is_software = self._getfromdict(record, host["type"]).lower() == "software" + files = self._getfromdict(record, host["filepath"]) + + # unzip only when a software record consists of a single file + + only_one_file = len(files) == 1 + for file_ref in files: for line in _fetch(file_ref, unzip=is_software and only_one_file): yield line diff --git a/repo2docker/utils.py b/repo2docker/utils.py index d9ebdc01..94f6be54 100644 --- a/repo2docker/utils.py +++ b/repo2docker/utils.py @@ -391,3 +391,24 @@ def copytree( if errors: raise Error(errors) return dst + + +# Code segments below from idutils (https://github.com/inveniosoftware/idutils) +# Copyright (C) 2015-2018 CERN. +# Copyright (C) 2018 Alan Rubin. +# Licensed under BSD-3-Clause license +doi_regexp = re.compile( + r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I +) +"""See http://en.wikipedia.org/wiki/Digital_object_identifier.""" + + +def is_doi(val): + """Test if argument is a DOI.""" + return doi_regexp.match(val) + + +def normalize_doi(val): + """Normalize a DOI.""" + m = doi_regexp.match(val) + return m.group(2) diff --git a/setup.py b/setup.py index 8b47c9a5..9b7ce522 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ setup( "ruamel.yaml>=0.15", "toml", "semver", - "idutils", ], python_requires=">=3.5", author="Project Jupyter Contributors", diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index a0d26584..596e9825 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -24,13 +24,47 @@ def test_detect(): with patch.object(Zenodo, "_urlopen") as fake_urlopen: fake_urlopen.return_value.url = "https://zenodo.org/record/3232985" # valid 
Zenodo DOIs trigger this content provider - assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"} - assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == { - "record": "3232985" + assert Zenodo().detect("10.5281/zenodo.3232985") == { + "host": { + "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], + "api": "https://zenodo.org/api/records/", + "filepath": "files", + "filename": "filename", + "download": "links.download", + "type": "metadata.upload_type", + }, + "record": "3232985", } - assert Zenodo().detect("https://zenodo.org/record/3232985") == { - "record": "3232985" + assert ( + Zenodo().detect("https://doi.org/10.5281/zenodo.3232985")["record"] + == "3232985" + ) + assert ( + Zenodo().detect("https://zenodo.org/record/3232985")["record"] == "3232985" + ) + + # only two of the three calls above have to resolve a DOI + assert fake_urlopen.call_count == 2 + + with patch.object(Zenodo, "_urlopen") as fake_urlopen: + fake_urlopen.return_value.url = "https://data.caltech.edu/records/1235" + # valid CaltechDATA DOIs trigger this content provider + assert Zenodo().detect("10.22002/d1.1235") == { + "host": { + "hostname": [ + "https://data.caltech.edu/records/", + "http://data.caltech.edu/records/", + ], + "api": "https://data.caltech.edu/api/record/", + "filepath": "metadata.electronic_location_and_access", + "filename": "electronic_name.0", + "download": "uniform_resource_identifier", + "type": "metadata.resourceType.resourceTypeGeneral", + }, + "record": "1235", } + assert Zenodo().detect("https://doi.org/10.22002/d1.1235")["record"] == "1235" + assert Zenodo().detect("https://data.caltech.edu/records/1235")["record"] == "1235" # only two of the three calls above have to resolve a DOI assert fake_urlopen.call_count == 2 @@ -83,10 +117,24 @@ def test_fetch_software_from_github_archive(): with patch.object(Zenodo, "_urlopen", new=mock_urlopen): zen = Zenodo() + spec = { + "host": { + "hostname": [ + 
"https://zenodo.org/record/", + "http://zenodo.org/record/", + ], + "api": "https://zenodo.org/api/records/", + "filepath": "files", + "filename": "filename", + "download": "links.download", + "type": "metadata.upload_type", + }, + "record": "1234", + } with TemporaryDirectory() as d: output = [] - for l in zen.fetch({"record": "1234"}, d): + for l in zen.fetch(spec, d): output.append(l) unpacked_files = set(os.listdir(d)) @@ -123,9 +171,22 @@ def test_fetch_software(): with patch.object(Zenodo, "_urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - + spec = { + "host": { + "hostname": [ + "https://zenodo.org/record/", + "http://zenodo.org/record/", + ], + "api": "https://zenodo.org/api/records/", + "filepath": "files", + "filename": "filename", + "download": "links.download", + "type": "metadata.upload_type", + }, + "record": "1234", + } output = [] - for l in zen.fetch({"record": "1234"}, d): + for l in zen.fetch(spec, d): output.append(l) unpacked_files = set(os.listdir(d)) @@ -164,9 +225,22 @@ def test_fetch_data(): with patch.object(Zenodo, "_urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - + spec = { + "host": { + "hostname": [ + "https://zenodo.org/record/", + "http://zenodo.org/record/", + ], + "api": "https://zenodo.org/api/records/", + "filepath": "files", + "filename": "filename", + "download": "links.download", + "type": "metadata.upload_type", + }, + "record": "1234", + } output = [] - for l in zen.fetch({"record": "1234"}, d): + for l in zen.fetch(spec, d): output.append(l) unpacked_files = set(os.listdir(d))