# repo2docker/repo2docker/contentproviders/zenodo.py

import os
import json
import shutil

from os import makedirs
from os import path
from urllib.request import build_opener, urlopen, Request
from zipfile import ZipFile, is_zipfile

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


class Zenodo(ContentProvider):
    """Provide contents of a Zenodo deposit.

    `detect()` maps a Zenodo DOI or record URL to a record ID and
    `fetch()` downloads every file attached to that record into a
    directory.
    """

    def _urlopen(self, req, headers=None):
        """A urlopen() helper that adds a repo2docker User-Agent header.

        `req` is either a URL string or a `urllib.request.Request`.
        `headers` is an optional dict of extra headers; they are added after
        the User-Agent header, so a "User-Agent" entry in it wins.

        Returns whatever `urlopen()` returns; the caller has to close it.
        """
        # someone passed a string, not a request
        if not isinstance(req, Request):
            req = Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return urlopen(req)

    def _resolve_url(self, url):
        """Open `url` and return the URL of the response, then close it.

        Only the final URL is of interest here, the body is never read.
        """
        resp = self._urlopen(url)
        # close() instead of a `with` block: all that is required of the
        # response object is a `.url` attribute and a `.close()` method
        try:
            return resp.url
        finally:
            resp.close()

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Zenodo record

        `doi` can be a bare Zenodo DOI ("10.5281/..."), a doi.org URL for
        such a DOI or a zenodo.org record URL. `ref` and `extra_args` are
        not used.

        Returns `{"record": <record id>}` and remembers the ID in
        `self.record_id`, or returns None if `doi` is none of the above.
        """
        # To support Zenodo instances not hosted at zenodo.org we need to
        # start maintaining a list of known DOI prefixes and their hostname.
        # We should also change to returning a complete `record_url` that
        # fetch() can use instead of constructing a URL there
        doi = doi.lower()
        # 10.5281 is the Zenodo DOI prefix
        if doi.startswith("10.5281/"):
            record_url = self._resolve_url("https://doi.org/{}".format(doi))
        elif doi.startswith(
            ("https://doi.org/10.5281/", "http://doi.org/10.5281/")
        ):
            record_url = self._resolve_url(doi)
        elif doi.startswith(
            ("https://zenodo.org/record/", "http://zenodo.org/record/")
        ):
            # already a record URL, nothing to resolve
            record_url = doi
        else:
            return None

        # the record ID is the last component of the record URL
        self.record_id = record_url.rsplit("/", maxsplit=1)[1]
        return {"record": self.record_id}

    def _fetch_file(self, file_ref, output_dir, unzip=False):
        """Download one file of a record into `output_dir`.

        `file_ref` is one entry of the record's "files" list; its
        "filename" and its "download" link are used. This is a generator
        that yields progress messages.

        The assumption is that `unzip=True` means that this is the only
        file related to the zenodo record. In that case a downloaded ZIP
        file is extracted into `output_dir` and then deleted. If this
        leaves a single directory in `output_dir` the contents of that
        directory are moved up to `output_dir`.
        """
        with self._urlopen(file_ref["links"]["download"]) as src:
            fname = file_ref["filename"]
            rel_dir = path.dirname(fname)
            # sub_dir stays None for files at the top level of the record
            sub_dir = path.join(output_dir, rel_dir) if rel_dir else None
            if sub_dir is not None and not path.exists(sub_dir):
                yield "Creating {}\n".format(sub_dir)
                makedirs(sub_dir, exist_ok=True)

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield "Fetching {}\n".format(fname)
                shutil.copyfileobj(src, dst)

        # first close the newly written file and the connection, then
        # continue processing the file
        if not (unzip and is_zipfile(dst_fname)):
            return

        yield "Extracting {}\n".format(fname)
        with ZipFile(dst_fname) as zfile:
            zfile.extractall(path=output_dir)

        # delete downloaded file ...
        os.remove(dst_fname)
        # ... and the directory it was downloaded to, if any
        if sub_dir is not None:
            shutil.rmtree(sub_dir)

        entries = os.listdir(output_dir)
        # if there is only one new subdirectory move its contents
        # to the top level directory
        if len(entries) == 1:
            only_entry = path.join(output_dir, entries[0])
            # an archive holding a single plain file is left as it is,
            # a file can't be passed to copytree() or rmtree()
            if path.isdir(only_entry):
                copytree(only_entry, output_dir)
                shutil.rmtree(only_entry)

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Zenodo record

        `spec` is the dict returned by `detect()`. All files of the record
        are downloaded to `output_dir`. A record of type "software" that
        consists of exactly one file gets that file unpacked if it is a
        ZIP file. `yield_output` is not used.

        This is a generator that yields progress messages.
        """
        record_id = spec["record"]

        yield "Fetching Zenodo record {}.\n".format(record_id)
        req = Request(
            "https://zenodo.org/api/records/{}".format(record_id),
            headers={"accept": "application/json"},
        )
        with self._urlopen(req) as resp:
            record = json.loads(resp.read().decode("utf-8"))

        is_software = record["metadata"]["upload_type"] == "software"
        only_one_file = len(record["files"]) == 1
        for file_ref in record["files"]:
            yield from self._fetch_file(
                file_ref, output_dir, unzip=is_software and only_one_file
            )

    @property
    def content_id(self):
        """The Zenodo record ID as the content of a record is immutable

        The ID is set by `detect()`.
        """
        return self.record_id
Better handling of software archives Unpack a single ZIP file, then move it to the root of the directory 2019-05-27 17:53:20 +00:00			`import os`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`import json`
			`import shutil`

			`from os import makedirs`
			`from os import path`
Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`from urllib.request import build_opener, urlopen, Request`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`from zipfile import ZipFile, is_zipfile`

			`from .base import ContentProvider`
			`from ..utils import copytree`
Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`from .. import __version__`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00

			`class Zenodo(ContentProvider):`
			`"""Provide contents of a Zenodo deposit."""`

Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`def _urlopen(self, req, headers=None):`
			`"""A urlopen() helper"""`
			`# someone passed a string, not a request`
			`if not isinstance(req, Request):`
			`req = Request(req)`

			`req.add_header("User-Agent", "repo2docker {}".format(__version__))`
			`if headers is not None:`
			`for key, value in headers.items():`
			`req.add_header(key, value)`

			`return urlopen(req)`

Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`def detect(self, doi, ref=None, extra_args=None):`
Update docstrings in Zenodo provider 2019-05-28 17:28:05 +00:00			`"""Trigger this provider for things that resolve to a Zenodo record"""`
Add comment about supporting Zenodo instances not at zenodo.org 2019-05-29 05:10:35 +00:00			`# To support Zenodo instances not hosted at zenodo.org we need to`
			`# start maintaining a list of known DOI prefixes and their hostname.`
			# We should also change to returning a complete `record_url` that
			`# fetch() can use instead of constructing a URL there`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`doi = doi.lower()`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`# 10.5281 is the Zenodo DOI prefix`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`if doi.startswith("10.5281/"):`
Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`resp = self._urlopen("https://doi.org/{}".format(doi))`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`self.record_id = resp.url.rsplit("/", maxsplit=1)[1]`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`return {"record": self.record_id}`

			`elif doi.startswith("https://doi.org/10.5281/") or doi.startswith(`
			`"http://doi.org/10.5281/"`
			`):`
Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`resp = self._urlopen(doi)`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`self.record_id = resp.url.rsplit("/", maxsplit=1)[1]`
			`return {"record": self.record_id}`

			`elif doi.startswith("https://zenodo.org/record/") or doi.startswith(`
			`"http://zenodo.org/record/"`
			`):`
			`self.record_id = doi.rsplit("/", maxsplit=1)[1]`
			`return {"record": self.record_id}`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00
			`def fetch(self, spec, output_dir, yield_output=False):`
Update docstrings in Zenodo provider 2019-05-28 17:28:05 +00:00			`"""Fetch and unpack a Zenodo record"""`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`record_id = spec["record"]`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00
			`yield "Fetching Zenodo record {}.\n".format(record_id)`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`req = Request(`
			`"https://zenodo.org/api/records/{}".format(record_id),`
			`headers={"accept": "application/json"},`
			`)`
Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`resp = self._urlopen(req)`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00
			`record = json.loads(resp.read().decode("utf-8"))`

			`def _fetch(file_ref, unzip=False):`
Better handling of software archives Unpack a single ZIP file, then move it to the root of the directory 2019-05-27 17:53:20 +00:00			# the assumption is that `unzip=True` means that this is the only
			`# file related to the zenodo record`
Add a urlopen helper to Zenodo content provider Use a helper function to inject a default user-agent header into every request we make. 2019-05-29 06:17:22 +00:00			`with self._urlopen(file_ref["links"]["download"]) as src:`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`fname = file_ref["filename"]`
Better handling of software archives Unpack a single ZIP file, then move it to the root of the directory 2019-05-27 17:53:20 +00:00			`if path.dirname(fname):`
			`sub_dir = path.join(output_dir, path.dirname(fname))`
			`if not path.exists(sub_dir):`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`yield "Creating {}\n".format(sub_dir)`
Better handling of software archives Unpack a single ZIP file, then move it to the root of the directory 2019-05-27 17:53:20 +00:00			`makedirs(sub_dir, exist_ok=True)`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00
			`dst_fname = path.join(output_dir, fname)`
			`with open(dst_fname, "wb") as dst:`
			`yield "Fetching {}\n".format(fname)`
			`shutil.copyfileobj(src, dst)`
			`# first close the newly written file, then continue`
			`# processing it`
			`if unzip and is_zipfile(dst_fname):`
Better handling of software archives Unpack a single ZIP file, then move it to the root of the directory 2019-05-27 17:53:20 +00:00			`yield "Extracting {}\n".format(fname)`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`zfile = ZipFile(dst_fname)`
			`zfile.extractall(path=output_dir)`
			`zfile.close()`
Better handling of software archives Unpack a single ZIP file, then move it to the root of the directory 2019-05-27 17:53:20 +00:00
			`# delete downloaded file ...`
			`os.remove(dst_fname)`
			`# ... and any directories we might have created,`
			`# in which case sub_dir will be defined`
			`if path.dirname(fname):`
			`shutil.rmtree(sub_dir)`

			`new_subdirs = os.listdir(output_dir)`
			`# if there is only one new subdirectory move its contents`
			`# to the top level directory`
			`if len(new_subdirs) == 1:`
			`d = new_subdirs[0]`
			`copytree(path.join(output_dir, d), output_dir)`
			`shutil.rmtree(path.join(output_dir, d))`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00
			`is_software = record["metadata"]["upload_type"] == "software"`
			`only_one_file = len(record["files"]) == 1`
Add tests for Zenodo content provider 2019-05-28 17:10:32 +00:00			`for file_ref in record["files"]:`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`for line in _fetch(file_ref, unzip=is_software and only_one_file):`
			`yield line`

			`@property`
			`def content_id(self):`
Update docstrings in Zenodo provider 2019-05-28 17:28:05 +00:00			`"""The Zenodo record ID as the content of a record is immutable"""`
Add basic Zenodo content provider 2019-05-27 15:32:03 +00:00			`return self.record_id`