Add basic Zenodo content provider

2019-05-27 17:32:03 +02:00 · 2019-05-27 17:32:03 +02:00 · dce6c1e8d7
commit dce6c1e8d7
--- a/repo2docker/app.py
+++ b/repo2docker/app.py
@ -136,6 +136,7 @@ class Repo2Docker(Application):
    content_providers = List(
        [
            contentproviders.Local,
+            contentproviders.Zenodo,
            contentproviders.Git,
        ],
        config=True,
--- a/repo2docker/contentproviders/__init__.py
+++ b/repo2docker/contentproviders/__init__.py
@ -1,2 +1,3 @@
 from .git import Git
 from .base import Local
+from .zenodo import Zenodo
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@ -0,0 +1,73 @@
+import json
+import shutil
+
+from os import makedirs
+from os import path
+from urllib.request import urlopen, Request
+from zipfile import ZipFile, is_zipfile
+
+from .base import ContentProvider
+from ..utils import copytree
+
+
+class Zenodo(ContentProvider):
+    """Provide contents of a Zenodo deposit.
+
+    `detect()` resolves a Zenodo DOI to a record id and `fetch()` downloads
+    the files listed for that record into the output directory.
+    """
+
+    def detect(self, doi, ref=None, extra_args=None):
+        """Return a spec for `doi` if it is a Zenodo DOI, otherwise None.
+
+        The DOI is resolved through doi.org and the last path segment of
+        the URL that is finally served is stored as the record id.
+        `ref` and `extra_args` are not used by this provider.
+        """
+        # 10.5281 is the Zenodo DOI prefix. The slash is part of the check
+        # so that longer registrant prefixes (e.g. 10.52810) do not match.
+        if not doi.startswith('10.5281/'):
+            return None
+
+        # close the response once the final URL has been read
+        with urlopen("https://doi.org/{}".format(doi)) as resp:
+            self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
+        return {'record': self.record_id}
+
+    def fetch(self, spec, output_dir, yield_output=False):
+        """Download all files of the record in `spec` into `output_dir`.
+
+        This is a generator that yields progress messages. `spec` is the
+        dictionary returned by `detect()`. A record of upload type
+        "software" that consists of exactly one file has that file
+        unpacked if it is a zip archive. `yield_output` is not used.
+        """
+        record_id = spec['record']
+
+        yield "Fetching Zenodo record {}.\n".format(record_id)
+        req = Request("https://zenodo.org/api/records/{}".format(record_id),
+                      headers={"accept": "application/json"})
+        with urlopen(req) as resp:
+            record = json.loads(resp.read().decode("utf-8"))
+
+        files = record["files"]
+        is_software = record["metadata"]["upload_type"] == "software"
+        unzip = is_software and len(files) == 1
+        for file_ref in files:
+            yield from self._fetch_file(file_ref, output_dir, unzip=unzip)
+
+    def _fetch_file(self, file_ref, output_dir, unzip=False):
+        """Download one file of a record into `output_dir`.
+
+        `file_ref` is one entry of the record's "files" list. If `unzip`
+        is true and the download is a zip archive, the archive is
+        extracted into `output_dir` and then deleted. If the result is a
+        single directory its contents are moved up into `output_dir`.
+
+        Raises ValueError if the file name would place the download
+        outside of `output_dir`.
+        """
+        from os import listdir, remove
+
+        fname = file_ref["filename"]
+        dst_fname = path.join(output_dir, fname)
+
+        # the file name comes from the remote record, never let it point
+        # outside of the output directory
+        root = path.join(path.abspath(output_dir), "")
+        if not path.abspath(dst_fname).startswith(root):
+            raise ValueError(
+                "{} is outside of {}".format(fname, output_dir))
+
+        dir_part = path.dirname(fname)
+        sub_dir = path.join(output_dir, dir_part)
+        makedirs(sub_dir, exist_ok=True)
+
+        yield "Fetching {}\n".format(fname)
+        with urlopen(file_ref["links"]["download"]) as src:
+            with open(dst_fname, "wb") as dst:
+                shutil.copyfileobj(src, dst)
+
+        # the downloaded file is closed at this point, so it is safe to
+        # inspect and unpack it
+        if not (unzip and is_zipfile(dst_fname)):
+            return
+
+        with ZipFile(dst_fname) as zfile:
+            zfile.extractall(path=output_dir)
+
+        # delete the downloaded archive ...
+        remove(dst_fname)
+        # ... and the directory created to hold it. Without a directory
+        # part `sub_dir` is the output directory itself, which now holds
+        # the extracted content and must be kept.
+        if dir_part:
+            shutil.rmtree(sub_dir)
+
+        # if the archive unpacked into a single directory, move the
+        # contents of that directory to the top level
+        entries = listdir(output_dir)
+        if len(entries) == 1:
+            top_dir = path.join(output_dir, entries[0])
+            if path.isdir(top_dir):
+                copytree(top_dir, output_dir)
+                shutil.rmtree(top_dir)
+
+    @property
+    def content_id(self):
+        """A unique ID to represent the version of the content.
+
+        This is the Zenodo record id stored by `detect()`, so it is only
+        available after `detect()` has matched a DOI.
+        """
+        return self.record_id
--- a/repo2docker/utils.py
+++ b/repo2docker/utils.py
@ -4,6 +4,8 @@ import os
 import re
 import subprocess

+from shutil import copystat, copy2
+
 from traitlets import Integer, TraitError


@ -287,3 +289,91 @@ def check_ref(ref, cwd=None):
            # We'll throw an error later if no refs resolve
            pass
    return hash
+
+
+class Error(OSError):
+    """Raised by copytree() with a list of (src, dst, reason) tuples."""
+    pass
+
+
+# a copy of shutil.copytree() that is ok with the target directory
+# already existing
+def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
+             ignore_dangling_symlinks=False):
+    """Recursively copy a directory tree.
+
+    Unlike shutil.copytree() the destination directory may already
+    exist: it is created if it is missing and the entries of `src` are
+    copied into it.
+    If exception(s) occur, an Error is raised with a list of reasons.
+    If the optional symlinks flag is true, symbolic links in the
+    source tree result in symbolic links in the destination tree; if
+    it is false, the contents of the files pointed to by symbolic
+    links are copied. If the file pointed to by the symlink doesn't
+    exist, an exception will be added to the list of errors raised in
+    an Error exception at the end of the copy process.
+    You can set the optional ignore_dangling_symlinks flag to true if you
+    want to silence this exception. The flag applies to the whole tree,
+    including sub-directories. Notice that this has no effect on
+    platforms that don't support os.symlink.
+    The optional ignore argument is a callable. If given, it
+    is called with the `src` parameter, which is the directory
+    being visited by copytree(), and `names` which is the list of
+    `src` contents, as returned by os.listdir():
+        callable(src, names) -> ignored_names
+    Since copytree() is called recursively, the callable will be
+    called once for each directory that is copied. It returns a
+    list of names relative to the `src` directory that should
+    not be copied.
+    The optional copy_function argument is a callable that will be used
+    to copy each file. It will be called with the source path and the
+    destination path as arguments. By default, copy2() is used, but any
+    function that supports the same signature (like copy()) can be used.
+    Returns `dst`.
+    """
+    names = os.listdir(src)
+    if ignore is not None:
+        ignored_names = ignore(src, names)
+    else:
+        ignored_names = set()
+
+    # this is the difference to shutil.copytree(): an existing
+    # destination directory is fine
+    os.makedirs(dst, exist_ok=True)
+    errors = []
+    for name in names:
+        if name in ignored_names:
+            continue
+        srcname = os.path.join(src, name)
+        dstname = os.path.join(dst, name)
+        try:
+            if os.path.islink(srcname):
+                linkto = os.readlink(srcname)
+                if symlinks:
+                    # We can't just leave it to `copy_function` because legacy
+                    # code with a custom `copy_function` may rely on copytree
+                    # doing the right thing.
+                    os.symlink(linkto, dstname)
+                    copystat(srcname, dstname, follow_symlinks=not symlinks)
+                else:
+                    # ignore dangling symlink if the flag is on. Check
+                    # through the link itself: a relative `linkto` is
+                    # relative to the link's directory, not to the
+                    # current working directory.
+                    if not os.path.exists(srcname) and ignore_dangling_symlinks:
+                        continue
+                    # otherwise let the copy occur. copy2 will raise an error
+                    if os.path.isdir(srcname):
+                        copytree(
+                            srcname, dstname, symlinks, ignore,
+                            copy_function,
+                            ignore_dangling_symlinks=ignore_dangling_symlinks)
+                    else:
+                        copy_function(srcname, dstname)
+            elif os.path.isdir(srcname):
+                # pass all options on so that they apply to the whole tree
+                copytree(
+                    srcname, dstname, symlinks, ignore, copy_function,
+                    ignore_dangling_symlinks=ignore_dangling_symlinks)
+            else:
+                # Will raise a SpecialFileError for unsupported file types
+                copy_function(srcname, dstname)
+        # catch the Error from the recursive copytree so that we can
+        # continue with other files
+        except Error as err:
+            errors.extend(err.args[0])
+        except OSError as why:
+            errors.append((srcname, dstname, str(why)))
+    try:
+        copystat(src, dst)
+    except OSError as why:
+        # Copying file access times may fail on Windows
+        if getattr(why, 'winerror', None) is None:
+            errors.append((src, dst, str(why)))
+    if errors:
+        raise Error(errors)
+    return dst