Add basic Zenodo content provider

2019-05-27 17:32:03 +02:00 · 2019-05-27 17:32:03 +02:00 · dce6c1e8d7
commit dce6c1e8d7
--- a/repo2docker/app.py
+++ b/repo2docker/app.py
@@ -136,6 +136,7 @@ class Repo2Docker(Application):
    content_providers = List(
        [
            contentproviders.Local,
            contentproviders.Zenodo,
            contentproviders.Git,
        ],
        config=True,
--- a/repo2docker/contentproviders/__init__.py
+++ b/repo2docker/contentproviders/__init__.py
@@ -1,2 +1,3 @@
 from .git import Git
 from .base import Local
 from .zenodo import Zenodo
--- a/repo2docker/contentproviders/zenodo.py
+++ b/repo2docker/contentproviders/zenodo.py
@@ -0,0 +1,73 @@
 import json
 import shutil
 from os import makedirs
 from os import path
 from urllib.request import urlopen, Request
 from zipfile import ZipFile, is_zipfile
 from .base import ContentProvider
 from ..utils import copytree
 class Zenodo(ContentProvider):
    """Provide contents of a Zenodo deposit.

    ``detect`` resolves a Zenodo DOI to a record ID and ``fetch`` downloads
    every file attached to that record into the output directory.
    """

    def detect(self, doi, ref=None, extra_args=None):
        """Return a spec dict if `doi` is a Zenodo DOI, otherwise None.

        The DOI is resolved through doi.org; the last path segment of the
        URL the request ends up at is taken as the record ID and is also
        remembered on the instance for `content_id`.

        `ref` and `extra_args` are accepted but not used.
        """
        # 10.5281 is the Zenodo DOI prefix
        if not doi.startswith('10.5281'):
            return None

        # Close the response as soon as the final URL has been read.
        with urlopen("https://doi.org/{}".format(doi)) as resp:
            self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
        return {'record': self.record_id}

    def fetch(self, spec, output_dir, yield_output=False):
        """Download the files of the record in `spec` into `output_dir`.

        This is a generator yielding progress messages. `spec` is the dict
        returned by `detect`. When the record is of upload type "software"
        and consists of exactly one file which is a zip archive, the archive
        is unpacked into `output_dir`, the downloaded archive is removed and,
        if the archive contained a single top-level directory, its contents
        are moved up into `output_dir`.

        `yield_output` is accepted but not used.
        """
        import os

        record_id = spec['record']

        yield "Fetching Zenodo record {}.\n".format(record_id)
        req = Request("https://zenodo.org/api/records/{}".format(record_id),
                      headers={"accept": "application/json"})
        with urlopen(req) as resp:
            record = json.loads(resp.read().decode("utf-8"))

        def _fetch(file_ref, unzip=False):
            # NOTE(review): the filename comes from the remote API and is
            # joined onto output_dir unchecked -- consider rejecting
            # absolute paths and ".." components.
            fname = file_ref["filename"]
            rel_dir = path.dirname(fname)
            dst_fname = path.join(output_dir, fname)

            with urlopen(file_ref["links"]["download"]) as src:
                makedirs(path.join(output_dir, rel_dir), exist_ok=True)
                with open(dst_fname, "wb") as dst:
                    yield "Fetching {}\n".format(fname)
                    shutil.copyfileobj(src, dst)

            # the downloaded file is closed at this point, so it is safe
            # to inspect and unpack it
            if not (unzip and is_zipfile(dst_fname)):
                return

            with ZipFile(dst_fname) as zfile:
                zfile.extractall(path=output_dir)

            # Remove the archive itself and any directories that were only
            # created to hold it. rmdir() refuses to delete non-empty
            # directories, so extracted content is never removed here.
            os.remove(dst_fname)
            while rel_dir:
                try:
                    os.rmdir(path.join(output_dir, rel_dir))
                except OSError:
                    break
                rel_dir = path.dirname(rel_dir)

            # If the archive unpacked into a single top-level directory,
            # move its contents up so they sit directly in output_dir.
            entries = os.listdir(output_dir)
            if len(entries) == 1:
                top_dir = path.join(output_dir, entries[0])
                if path.isdir(top_dir):
                    copytree(top_dir, output_dir)
                    shutil.rmtree(top_dir)

        is_software = record["metadata"]["upload_type"] == "software"
        only_one_file = len(record["files"]) == 1
        for file_ref in record['files']:
            yield from _fetch(file_ref, unzip=is_software and only_one_file)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content.

        This is the Zenodo record ID stored by `detect`, so `detect` must
        have matched a DOI before this property is read.
        """
        return self.record_id
--- a/repo2docker/utils.py
+++ b/repo2docker/utils.py
@@ -4,6 +4,8 @@ import os
 import re
 import subprocess
 from shutil import copystat, copy2
 from traitlets import Integer, TraitError
@@ -287,3 +289,91 @@ def check_ref(ref, cwd=None):
            # We'll throw an error later if no refs resolve
            pass
    return hash
 class Error(OSError):
    """Aggregate failure raised by copytree().

    The first argument is the list of (source, destination, reason)
    entries collected while copying.
    """
 # a copy of shutil.copytree() that is ok with the target directory
 # already existing
 def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
             ignore_dangling_symlinks=False):
    """Recursively copy a directory tree into `dst`.

    Unlike shutil.copytree(), the destination directory may already exist;
    it is created if missing and the copied entries are added to whatever
    is already there.

    If exception(s) occur, an Error is raised with a list of reasons once
    all entries have been attempted.

    If the optional symlinks flag is true, symbolic links in the
    source tree result in symbolic links in the destination tree; if
    it is false, the contents of the files pointed to by symbolic
    links are copied. If the file pointed to by the symlink doesn't
    exist, an exception will be added to the list of errors raised in
    an Error exception at the end of the copy process.

    You can set the optional ignore_dangling_symlinks flag to true if you
    want to silence this exception. The flag applies to the whole tree,
    including nested directories. Notice that this has no effect on
    platforms that don't support os.symlink.

    The optional ignore argument is a callable. If given, it
    is called with the `src` parameter, which is the directory
    being visited by copytree(), and `names` which is the list of
    `src` contents, as returned by os.listdir():

        callable(src, names) -> ignored_names

    Since copytree() is called recursively, the callable will be
    called once for each directory that is copied. It returns a
    list of names relative to the `src` directory that should
    not be copied.

    The optional copy_function argument is a callable that will be used
    to copy each file. It will be called with the source path and the
    destination path as arguments. By default, copy2() is used, but any
    function that supports the same signature (like copy()) can be used.

    Returns `dst`.
    """
    names = os.listdir(src)
    if ignore is not None:
        ignored_names = ignore(src, names)
    else:
        ignored_names = set()

    os.makedirs(dst, exist_ok=True)
    errors = []
    for name in names:
        if name in ignored_names:
            continue
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if os.path.islink(srcname):
                if symlinks:
                    # We can't just leave it to `copy_function` because legacy
                    # code with a custom `copy_function` may rely on copytree
                    # doing the right thing.
                    os.symlink(os.readlink(srcname), dstname)
                    copystat(srcname, dstname, follow_symlinks=not symlinks)
                else:
                    # Ignore a dangling symlink if the flag is on.
                    # os.path.exists() follows the link, so a relative
                    # target is resolved against the link's own directory
                    # rather than the current working directory.
                    if ignore_dangling_symlinks and not os.path.exists(srcname):
                        continue
                    # otherwise let the copy occur; copy2 will raise an error
                    if os.path.isdir(srcname):
                        copytree(srcname, dstname, symlinks, ignore,
                                 copy_function, ignore_dangling_symlinks)
                    else:
                        copy_function(srcname, dstname)
            elif os.path.isdir(srcname):
                copytree(srcname, dstname, symlinks, ignore, copy_function,
                         ignore_dangling_symlinks)
            else:
                # Will raise a SpecialFileError for unsupported file types
                copy_function(srcname, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except Error as err:
            errors.extend(err.args[0])
        except OSError as why:
            errors.append((srcname, dstname, str(why)))
    try:
        copystat(src, dst)
    except OSError as why:
        # Copying file access times may fail on Windows
        if getattr(why, 'winerror', None) is None:
            errors.append((src, dst, str(why)))
    if errors:
        raise Error(errors)
    return dst