From 87f33e3b6ae229dbfcea8e09abca450776c506de Mon Sep 17 00:00:00 2001
From: Tom Morrell <tmorrell@caltech.edu>
Date: Tue, 18 Jun 2019 15:11:06 -0700
Subject: [PATCH] Full generalization and support for CaltechDATA

---
 repo2docker/contentproviders/zenodo.py     | 42 +++++++---
 repo2docker/utils.py                       | 21 +++++
 setup.py                                   |  1 -
 tests/unit/contentproviders/test_zenodo.py | 94 +++++++++++++++++++---
 4 files changed, 138 insertions(+), 20 deletions(-)

diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py
index 8d9ec367..03bc076b 100644
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@@ -1,15 +1,16 @@
 import os
 import json
 import shutil
+import copy
 
 from os import makedirs
 from os import path
 from urllib.request import build_opener, urlopen, Request
 from zipfile import ZipFile, is_zipfile
-from idutils import normalize_doi, is_doi
 
 from .base import ContentProvider
 from ..utils import copytree
+from ..utils import normalize_doi, is_doi
 from .. import __version__
 
 
@@ -40,6 +41,24 @@ class Zenodo(ContentProvider):
         else:
             return doi
 
+    def _getfromdict(self, datadict, dotpath):
+        """Return a copy of the value found at `dotpath` inside `datadict`.
+
+        `dotpath` is a string of keys separated by periods; segments made
+        up only of digits are converted to ints for list access, so
+        "data.files.0" returns the value at datadict["data"]["files"][0].
+        Lookup errors (e.g. KeyError for a missing key) are not caught.
+        """
+        value = datadict
+        for segment in dotpath.split("."):
+            # digit-only segments index into lists, the rest are dict keys
+            key = int(segment) if segment.isdigit() else segment
+            value = value[key]
+        # Deep-copy only the extracted value rather than the whole record;
+        # callers still receive an object that is independent of `datadict`
+        # and large metadata documents are not duplicated for every lookup.
+        return copy.deepcopy(value)
+
     def detect(self, doi, ref=None, extra_args=None):
         """Trigger this provider for things that resolve to a Zenodo/Invenio record"""
         # We need the hostname (url where records are), api url (for metadata),
@@ -51,7 +70,7 @@ class Zenodo(ContentProvider):
                 "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
                 "api": "https://zenodo.org/api/records/",
                 "filepath": "files",
-                "filename": "files.key",
+                "filename": "filename",
                 "download": "links.download",
                 "type": "metadata.upload_type",
             },
@@ -61,8 +80,9 @@ class Zenodo(ContentProvider):
                     "http://data.caltech.edu/records/",
                 ],
                 "api": "https://data.caltech.edu/api/record/",
-                "filepath": "files",
-                "filename": "electronic_location_and_access.electronic_name.0",
+                "filepath": "metadata.electronic_location_and_access",
+                "filename": "electronic_name.0",
+                "download": "uniform_resource_identifier",
                 "type": "metadata.resourceType.resourceTypeGeneral",
             },
         ]
@@ -91,8 +111,8 @@ class Zenodo(ContentProvider):
         def _fetch(file_ref, unzip=False):
             # the assumption is that `unzip=True` means that this is the only
             # file related to the zenodo record
-            with self._urlopen(file_ref["links"]["download"]) as src:
-                fname = file_ref["filename"]
+            with self._urlopen(self._getfromdict(file_ref, host["download"])) as src:
+                fname = self._getfromdict(file_ref, host["filename"])
                 if path.dirname(fname):
                     sub_dir = path.join(output_dir, path.dirname(fname))
                     if not path.exists(sub_dir):
@@ -126,9 +146,13 @@ class Zenodo(ContentProvider):
                         copytree(path.join(output_dir, d), output_dir)
                         shutil.rmtree(path.join(output_dir, d))
 
-        is_software = record["metadata"]["upload_type"] == "software"
-        only_one_file = len(record["files"]) == 1
-        for file_ref in record["files"]:
+        is_software = self._getfromdict(record, host["type"]).lower() == "software"
+        files = self._getfromdict(record, host["filepath"])
+
+        # only unzip when the record is software and consists of a single file
+
+        only_one_file = len(files) == 1
+        for file_ref in files:
             for line in _fetch(file_ref, unzip=is_software and only_one_file):
                 yield line
 
diff --git a/repo2docker/utils.py b/repo2docker/utils.py
index d9ebdc01..94f6be54 100644
--- a/repo2docker/utils.py
+++ b/repo2docker/utils.py
@@ -391,3 +391,24 @@ def copytree(
     if errors:
         raise Error(errors)
     return dst
+
+
+# Code segments below from idutils (https://github.com/inveniosoftware/idutils)
+# Copyright (C) 2015-2018 CERN.
+# Copyright (C) 2018 Alan Rubin.
+# Licensed under BSD-3-Clause license
+doi_regexp = re.compile(  # raw string keeps "\s", "\." and "\d" as regex escapes
+    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I
+)
+"""See http://en.wikipedia.org/wiki/Digital_object_identifier."""
+
+
+def is_doi(val):
+    """Return the regex match (truthy) when `val` is a DOI, otherwise None."""
+    return doi_regexp.match(val)
+
+
+def normalize_doi(val):
+    """Return the bare DOI in `val`, without any "doi:" or doi.org prefix."""
+    m = doi_regexp.match(val)
+    return m.group(2)  # NOTE(review): m is None if `val` is not a DOI
diff --git a/setup.py b/setup.py
index 8b47c9a5..9b7ce522 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,6 @@ setup(
         "ruamel.yaml>=0.15",
         "toml",
         "semver",
-        "idutils",
     ],
     python_requires=">=3.5",
     author="Project Jupyter Contributors",
diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py
index a0d26584..596e9825 100644
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@@ -24,13 +24,47 @@ def test_detect():
     with patch.object(Zenodo, "_urlopen") as fake_urlopen:
         fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
         # valid Zenodo DOIs trigger this content provider
-        assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"}
-        assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == {
-            "record": "3232985"
+        assert Zenodo().detect("10.5281/zenodo.3232985") == {
+            "host": {
+                "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
+                "api": "https://zenodo.org/api/records/",
+                "filepath": "files",
+                "filename": "filename",
+                "download": "links.download",
+                "type": "metadata.upload_type",
+            },
+            "record": "3232985",
         }
-        assert Zenodo().detect("https://zenodo.org/record/3232985") == {
-            "record": "3232985"
+        assert (
+            Zenodo().detect("https://doi.org/10.5281/zenodo.3232985")["record"]
+            == "3232985"
+        )
+        assert (
+            Zenodo().detect("https://zenodo.org/record/3232985")["record"] == "3232985"
+        )
+
+        # only two of the three calls above have to resolve a DOI
+        assert fake_urlopen.call_count == 2
+
+    with patch.object(Zenodo, "_urlopen") as fake_urlopen:
+        fake_urlopen.return_value.url = "https://data.caltech.edu/records/1235"
+        # valid CaltechDATA DOIs trigger this content provider
+        assert Zenodo().detect("10.22002/d1.1235") == {
+            "host": {
+                "hostname": [
+                    "https://data.caltech.edu/records/",
+                    "http://data.caltech.edu/records/",
+                ],
+                "api": "https://data.caltech.edu/api/record/",
+                "filepath": "metadata.electronic_location_and_access",
+                "filename": "electronic_name.0",
+                "download": "uniform_resource_identifier",
+                "type": "metadata.resourceType.resourceTypeGeneral",
+            },
+            "record": "1235",
         }
+        assert Zenodo().detect("https://doi.org/10.22002/d1.1235")["record"] == "1235"
+        assert Zenodo().detect(fake_urlopen.return_value.url)["record"] == "1235"
 
         # only two of the three calls above have to resolve a DOI
         assert fake_urlopen.call_count == 2
@@ -83,10 +117,24 @@ def test_fetch_software_from_github_archive():
 
         with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
             zen = Zenodo()
+            spec = {
+                "host": {
+                    "hostname": [
+                        "https://zenodo.org/record/",
+                        "http://zenodo.org/record/",
+                    ],
+                    "api": "https://zenodo.org/api/records/",
+                    "filepath": "files",
+                    "filename": "filename",
+                    "download": "links.download",
+                    "type": "metadata.upload_type",
+                },
+                "record": "1234",
+            }
 
             with TemporaryDirectory() as d:
                 output = []
-                for l in zen.fetch({"record": "1234"}, d):
+                for l in zen.fetch(spec, d):
                     output.append(l)
 
                 unpacked_files = set(os.listdir(d))
@@ -123,9 +171,22 @@ def test_fetch_software():
         with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
             with TemporaryDirectory() as d:
                 zen = Zenodo()
-
+                spec = {
+                    "host": {
+                        "hostname": [
+                            "https://zenodo.org/record/",
+                            "http://zenodo.org/record/",
+                        ],
+                        "api": "https://zenodo.org/api/records/",
+                        "filepath": "files",
+                        "filename": "filename",
+                        "download": "links.download",
+                        "type": "metadata.upload_type",
+                    },
+                    "record": "1234",
+                }
                 output = []
-                for l in zen.fetch({"record": "1234"}, d):
+                for l in zen.fetch(spec, d):
                     output.append(l)
 
                 unpacked_files = set(os.listdir(d))
@@ -164,9 +225,22 @@ def test_fetch_data():
             with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
                 with TemporaryDirectory() as d:
                     zen = Zenodo()
-
+                    spec = {
+                        "host": {
+                            "hostname": [
+                                "https://zenodo.org/record/",
+                                "http://zenodo.org/record/",
+                            ],
+                            "api": "https://zenodo.org/api/records/",
+                            "filepath": "files",
+                            "filename": "filename",
+                            "download": "links.download",
+                            "type": "metadata.upload_type",
+                        },
+                        "record": "1234",
+                    }
                     output = []
-                    for l in zen.fetch({"record": "1234"}, d):
+                    for l in zen.fetch(spec, d):
                         output.append(l)
 
                     unpacked_files = set(os.listdir(d))