[MRG] Generalize Zenodo content provider to support other Invenio repositories (#704)

[MRG] Generalize Zenodo content provider to support other Invenio repositories
2019-06-21 09:48:52 +02:00 · 2019-06-21 09:48:52 +02:00 · 70819410f1
commit 70819410f1
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@ -5,10 +5,12 @@ import shutil
 from os import makedirs
 from os import path
 from urllib.request import build_opener, urlopen, Request
+from urllib.error import HTTPError
 from zipfile import ZipFile, is_zipfile

 from .base import ContentProvider
-from ..utils import copytree
+from ..utils import copytree, deep_get
+from ..utils import normalize_doi, is_doi
 from .. import __version__


@ -28,39 +30,64 @@ class Zenodo(ContentProvider):

        return urlopen(req)

+    def _doi2url(self, doi):
+        """Resolve a DOI to the URL that doi.org redirects it to.
+
+        Anything that is not a DOI is assumed to already be a URL and is
+        returned unchanged.
+        """
+        if not is_doi(doi):
+            return doi
+        doi = normalize_doi(doi)
+        try:
+            resp = self._urlopen("https://doi.org/{}".format(doi))
+        except HTTPError:
+            # The DOI doesn't resolve: hand back the normalized DOI itself
+            return doi
+        return resp.url
+
    def detect(self, doi, ref=None, extra_args=None):
-        """Trigger this provider for things that resolve to a Zenodo record"""
-        # To support Zenodo instances not hosted at zenodo.org we need to
-        # start maintaining a list of known DOI prefixes and their hostname.
-        # We should also change to returning a complete `record_url` that
-        # fetch() can use instead of constructing a URL there
-        doi = doi.lower()
-        # 10.5281 is the Zenodo DOI prefix
-        if doi.startswith("10.5281/"):
-            resp = self._urlopen("https://doi.org/{}".format(doi))
-            self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
-            return {"record": self.record_id}
+        """Trigger this provider for things that resolve to a Zenodo/Invenio record"""
+        # Each known host is described by: hostname (URL prefixes under which
+        # records live), api (metadata endpoint), and dotted `deep_get` paths into
+        # the metadata for filepath (file list), filename, download (URL) and type.
+        hosts = [
+            {
+                "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
+                "api": "https://zenodo.org/api/records/",
+                "filepath": "files",
+                "filename": "filename",
+                "download": "links.download",
+                "type": "metadata.upload_type",
+            },
+            {
+                "hostname": [
+                    "https://data.caltech.edu/records/",
+                    "http://data.caltech.edu/records/",
+                ],
+                "api": "https://data.caltech.edu/api/record/",
+                "filepath": "metadata.electronic_location_and_access",
+                "filename": "electronic_name.0",
+                "download": "uniform_resource_identifier",
+                "type": "metadata.resourceType.resourceTypeGeneral",
+            },
+        ]

-        elif doi.startswith("https://doi.org/10.5281/") or doi.startswith(
-            "http://doi.org/10.5281/"
-        ):
-            resp = self._urlopen(doi)
-            self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
-            return {"record": self.record_id}
+        url = self._doi2url(doi)

-        elif doi.startswith("https://zenodo.org/record/") or doi.startswith(
-            "http://zenodo.org/record/"
-        ):
-            self.record_id = doi.rsplit("/", maxsplit=1)[1]
-            return {"record": self.record_id}
+        for host in hosts:
+            if any(url.startswith(s) for s in host["hostname"]):
+                self.record_id = url.rsplit("/", maxsplit=1)[1]
+                return {"record": self.record_id, "host": host}

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Zenodo record"""
        record_id = spec["record"]
+        host = spec["host"]

        yield "Fetching Zenodo record {}.\n".format(record_id)
        req = Request(
-            "https://zenodo.org/api/records/{}".format(record_id),
+            "{}{}".format(host["api"], record_id),
            headers={"accept": "application/json"},
        )
        resp = self._urlopen(req)
@ -70,8 +97,8 @@ class Zenodo(ContentProvider):
        def _fetch(file_ref, unzip=False):
            # the assumption is that `unzip=True` means that this is the only
            # file related to the zenodo record
-            with self._urlopen(file_ref["links"]["download"]) as src:
-                fname = file_ref["filename"]
+            with self._urlopen(deep_get(file_ref, host["download"])) as src:
+                fname = deep_get(file_ref, host["filename"])
                if path.dirname(fname):
                    sub_dir = path.join(output_dir, path.dirname(fname))
                    if not path.exists(sub_dir):
@ -105,9 +132,10 @@ class Zenodo(ContentProvider):
                        copytree(path.join(output_dir, d), output_dir)
                        shutil.rmtree(path.join(output_dir, d))

-        is_software = record["metadata"]["upload_type"] == "software"
-        only_one_file = len(record["files"]) == 1
-        for file_ref in record["files"]:
+        is_software = deep_get(record, host["type"]).lower() == "software"
+        files = deep_get(record, host["filepath"])
+        only_one_file = len(files) == 1
+        for file_ref in files:
            for line in _fetch(file_ref, unzip=is_software and only_one_file):
                yield line

--- a/repo2docker/utils.py
+++ b/repo2docker/utils.py
@ -391,3 +391,45 @@ def copytree(
    if errors:
        raise Error(errors)
    return dst
+
+
+def deep_get(dikt, path):
+    """Look up a value inside nested dicts/lists using a dotted path.
+
+    Each period-separated component of `path` is applied as one
+    subscript, left to right. All-digit components are converted to
+    `int` so that they can index into lists:
+
+    deep_get(data, "data.files.0") == data["data"]["files"][0]
+
+    A missing key or index raises the usual KeyError / IndexError.
+    """
+    node = dikt
+    for key in path.split("."):
+        node = node[int(key) if key.isdigit() else key]
+    return node
+
+
+# doi_regexp, is_doi, and normalize_doi are from idutils (https://github.com/inveniosoftware/idutils)
+# Copyright (C) 2015-2018 CERN.
+# Copyright (C) 2018 Alan Rubin.
+# Licensed under BSD-3-Clause license
+doi_regexp = re.compile(  # group 2 captures the bare DOI (10.xxxx/...)
+    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I
+)
+
+
+def is_doi(val):
+    """Return a regex match object if `val` looks like a DOI, else None.
+
+    See http://en.wikipedia.org/wiki/Digital_object_identifier.
+    """
+    return doi_regexp.match(val)
+
+
+def normalize_doi(val):
+    """Strip any resolver URL or `doi:` prefix and return the bare DOI.
+
+    E.g. https://doi.org/10.1234/jshd123 -> 10.1234/jshd123. `val` must match `doi_regexp`.
+    """
+    return doi_regexp.match(val).group(2)
--- a/tests/unit/contentproviders/test_zenodo.py
+++ b/tests/unit/contentproviders/test_zenodo.py
@ -1,5 +1,6 @@
 import json
 import os
+import pytest

 from contextlib import contextmanager
 from io import BytesIO
@ -20,18 +21,57 @@ def test_content_id():
        assert zen.content_id == "3232985"


-def test_detect():
-    with patch.object(Zenodo, "_urlopen") as fake_urlopen:
-        fake_urlopen.return_value.url = "https://zenodo.org/record/3232985"
-        # valid Zenodo DOIs trigger this content provider
-        assert Zenodo().detect("10.5281/zenodo.3232985") == {"record": "3232985"}
-        assert Zenodo().detect("https://doi.org/10.5281/zenodo.3232985") == {
-            "record": "3232985"
-        }
-        assert Zenodo().detect("https://zenodo.org/record/3232985") == {
-            "record": "3232985"
-        }
+test_hosts = [
+    (
+        [
+            "https://zenodo.org/record/3232985",
+            "10.5281/zenodo.3232985",
+            "https://doi.org/10.5281/zenodo.3232985",
+        ],
+        {
+            "host": {
+                "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
+                "api": "https://zenodo.org/api/records/",
+                "filepath": "files",
+                "filename": "filename",
+                "download": "links.download",
+                "type": "metadata.upload_type",
+            },
+            "record": "3232985",
+        },
+    ),
+    (
+        [
+            "https://data.caltech.edu/records/1235",
+            "10.22002/d1.1235",
+            "https://doi.org/10.22002/d1.1235",
+        ],
+        {
+            "host": {
+                "hostname": [
+                    "https://data.caltech.edu/records/",
+                    "http://data.caltech.edu/records/",
+                ],
+                "api": "https://data.caltech.edu/api/record/",
+                "filepath": "metadata.electronic_location_and_access",
+                "filename": "electronic_name.0",
+                "download": "uniform_resource_identifier",
+                "type": "metadata.resourceType.resourceTypeGeneral",
+            },
+            "record": "1235",
+        },
+    ),
+]

+
+@pytest.mark.parametrize("test_input,expected", test_hosts)
+def test_detect_zenodo(test_input, expected):
+    with patch.object(Zenodo, "_urlopen") as fake_urlopen:
+        fake_urlopen.return_value.url = test_input[0]
+        # valid Zenodo DOIs trigger this content provider
+        assert Zenodo().detect(test_input[0]) == expected
+        assert Zenodo().detect(test_input[1]) == expected
+        assert Zenodo().detect(test_input[2]) == expected
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 2

@ -39,13 +79,12 @@ def test_detect():
        # Don't trigger the Zenodo content provider
        assert Zenodo().detect("/some/path/here") is None
        assert Zenodo().detect("https://example.com/path/here") is None
-        # donn't handle DOIs that aren't from Zenodo
+        # don't handle DOIs that aren't from Zenodo
+        fake_urlopen.return_value.url = (
+            "http://joss.theoj.org/papers/10.21105/joss.01277"
+        )
        assert Zenodo().detect("https://doi.org/10.21105/joss.01277") is None

-        # none of the examples are Zenodo like, so we should not attempt to
-        # resolve a DOI either
-        assert not fake_urlopen.called
-

@contextmanager
 def zenodo_archive(prefix="a_directory"):
@ -83,10 +122,24 @@ def test_fetch_software_from_github_archive():

        with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
            zen = Zenodo()
+            spec = {
+                "host": {
+                    "hostname": [
+                        "https://zenodo.org/record/",
+                        "http://zenodo.org/record/",
+                    ],
+                    "api": "https://zenodo.org/api/records/",
+                    "filepath": "files",
+                    "filename": "filename",
+                    "download": "links.download",
+                    "type": "metadata.upload_type",
+                },
+                "record": "1234",
+            }

            with TemporaryDirectory() as d:
                output = []
-                for l in zen.fetch({"record": "1234"}, d):
+                for l in zen.fetch(spec, d):
                    output.append(l)

                unpacked_files = set(os.listdir(d))
@ -123,9 +176,22 @@ def test_fetch_software():
        with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
            with TemporaryDirectory() as d:
                zen = Zenodo()
-
+                spec = {
+                    "host": {
+                        "hostname": [
+                            "https://zenodo.org/record/",
+                            "http://zenodo.org/record/",
+                        ],
+                        "api": "https://zenodo.org/api/records/",
+                        "filepath": "files",
+                        "filename": "filename",
+                        "download": "links.download",
+                        "type": "metadata.upload_type",
+                    },
+                    "record": "1234",
+                }
                output = []
-                for l in zen.fetch({"record": "1234"}, d):
+                for l in zen.fetch(spec, d):
                    output.append(l)

                unpacked_files = set(os.listdir(d))
@ -164,9 +230,22 @@ def test_fetch_data():
            with patch.object(Zenodo, "_urlopen", new=mock_urlopen):
                with TemporaryDirectory() as d:
                    zen = Zenodo()
-
+                    spec = {
+                        "host": {
+                            "hostname": [
+                                "https://zenodo.org/record/",
+                                "http://zenodo.org/record/",
+                            ],
+                            "api": "https://zenodo.org/api/records/",
+                            "filepath": "files",
+                            "filename": "filename",
+                            "download": "links.download",
+                            "type": "metadata.upload_type",
+                        },
+                        "record": "1234",
+                    }
                    output = []
-                    for l in zen.fetch({"record": "1234"}, d):
+                    for l in zen.fetch(spec, d):
                        output.append(l)

                    unpacked_files = set(os.listdir(d))
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@ -82,3 +82,31 @@ def test_invalid_port_mapping(port_spec):
        utils.validate_and_generate_port_mapping([port_spec])

    assert 'Port specification "{}"'.format(port_spec) in str(e.value)
+
+
+def test_deep_get():
+    data = {"data": {"files": [1, 2, 3]}}
+    # numeric path components index into lists
+    assert [utils.deep_get(data, "data.files.%d" % i) for i in (0, 1)] == [1, 2]
+    assert utils.deep_get(data, "data.files") == [1, 2, 3]
+    assert utils.deep_get(data, "data") == {"files": [1, 2, 3]}
+
+
+def test_is_doi():
+    assert utils.is_doi("10.1234/jshd123") is not None
+    assert utils.is_doi("10.1234/JSHD.8192") is not None
+    assert utils.is_doi("doi.org/10.1234/jshd123") is not None
+    assert utils.is_doi("http://doi.org/10.1234/jshd123") is not None
+    assert utils.is_doi("https://doi.org/10.1234/jshd123") is not None
+    assert utils.is_doi("http://dx.doi.org/10.1234/jshd123") is not None
+    assert utils.is_doi("101234/jshd123") is None
+    assert utils.is_doi("https://mybinder.org") is None
+
+
+def test_normalize_doi():
+    bare = "10.1234/jshd123"
+    assert utils.normalize_doi("10.1234/JSHD.8192") == "10.1234/JSHD.8192"
+    for prefix in ("", "doi.org/", "http://doi.org/", "https://doi.org/"):
+        assert utils.normalize_doi(prefix + bare) == bare
+    # the dx.doi.org resolver form is covered as well
+    assert utils.normalize_doi("http://dx.doi.org/" + bare) == bare