repo2docker/repo2docker/contentproviders/zenodo.py

118 wiersze
4.6 KiB
Python
Czysty Zwykły widok Historia

import os
2019-05-27 15:32:03 +00:00
import json
import shutil
from os import makedirs
from os import path
from urllib.request import build_opener, urlopen, Request
2019-05-27 15:32:03 +00:00
from zipfile import ZipFile, is_zipfile
from .base import ContentProvider
from ..utils import copytree
from .. import __version__
2019-05-27 15:32:03 +00:00
class Zenodo(ContentProvider):
    """Provide contents of a Zenodo deposit."""

    def _urlopen(self, req, headers=None):
        """Open `req` with a repo2docker User-Agent header attached.

        Parameters
        ----------
        req : str or urllib.request.Request
            URL string or prepared request. A string is wrapped in a
            `Request` first.
        headers : dict, optional
            Extra header name/value pairs added to the request.

        Returns
        -------
        The response object returned by `urllib.request.urlopen`. The
        caller is responsible for closing it (it can be used in a `with`
        statement).
        """
        # someone passed a string, not a request
        if not isinstance(req, Request):
            req = Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return urlopen(req)

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Zenodo record

        Accepts a bare Zenodo DOI (``10.5281/...``), a ``doi.org`` URL for
        such a DOI, or a ``zenodo.org/record/<id>`` URL. DOIs are resolved
        with an HTTP request and the record ID is taken from the last path
        segment of the final URL.

        Returns a spec dict ``{"record": <record id>}`` and stores the ID
        on ``self.record_id``; returns ``None`` when `doi` matches none of
        the supported forms. `ref` and `extra_args` are not used here.
        """
        # To support Zenodo instances not hosted at zenodo.org we need to
        # start maintaining a list of known DOI prefixes and their hostname.
        # We should also change to returning a complete `record_url` that
        # fetch() can use instead of constructing a URL there
        doi = doi.lower()

        # 10.5281 is the Zenodo DOI prefix
        if doi.startswith("10.5281/"):
            # close the response once the redirect target has been read
            with self._urlopen("https://doi.org/{}".format(doi)) as resp:
                resolved_url = resp.url
            self.record_id = resolved_url.rsplit("/", maxsplit=1)[1]
            return {"record": self.record_id}

        if doi.startswith(
            ("https://doi.org/10.5281/", "http://doi.org/10.5281/")
        ):
            with self._urlopen(doi) as resp:
                resolved_url = resp.url
            self.record_id = resolved_url.rsplit("/", maxsplit=1)[1]
            return {"record": self.record_id}

        if doi.startswith(
            ("https://zenodo.org/record/", "http://zenodo.org/record/")
        ):
            # already a record URL, no need to resolve anything
            self.record_id = doi.rsplit("/", maxsplit=1)[1]
            return {"record": self.record_id}

        # not something this provider handles
        return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Zenodo record

        Generator that downloads every file listed in the record named by
        ``spec["record"]`` into `output_dir` and yields progress messages
        (strings ending in a newline) as it goes. When the record's
        ``upload_type`` is ``"software"`` and it has exactly one file which
        is a zip archive, the archive is extracted into `output_dir` and
        removed afterwards.

        `yield_output` is accepted but not used by this implementation.

        Raises ``ValueError`` if a file name in the record would place the
        download outside of `output_dir`.
        """
        record_id = spec["record"]

        yield "Fetching Zenodo record {}.\n".format(record_id)
        req = Request(
            "https://zenodo.org/api/records/{}".format(record_id),
            headers={"accept": "application/json"},
        )
        # read the metadata and release the connection straight away
        with self._urlopen(req) as resp:
            record = json.loads(resp.read().decode("utf-8"))

        def _fetch(file_ref, unzip=False):
            # the assumption is that `unzip=True` means that this is the only
            # file related to the zenodo record
            fname = file_ref["filename"]
            dst_fname = path.join(output_dir, fname)

            # The file name comes from the remote API: refuse absolute paths
            # and `..` components that would write outside of output_dir.
            root = path.abspath(output_dir)
            if path.commonpath([root, path.abspath(dst_fname)]) != root:
                raise ValueError(
                    "Refusing to write {!r} outside of {!r}".format(
                        fname, output_dir
                    )
                )

            sub_dir = None
            with self._urlopen(file_ref["links"]["download"]) as src:
                if path.dirname(fname):
                    sub_dir = path.join(output_dir, path.dirname(fname))
                    if not path.exists(sub_dir):
                        yield "Creating {}\n".format(sub_dir)
                        makedirs(sub_dir, exist_ok=True)

                with open(dst_fname, "wb") as dst:
                    yield "Fetching {}\n".format(fname)
                    shutil.copyfileobj(src, dst)

            # the newly written file and the download are both closed now,
            # continue processing the file on disk
            if unzip and is_zipfile(dst_fname):
                yield "Extracting {}\n".format(fname)
                # context manager closes the archive even if extraction fails
                with ZipFile(dst_fname) as zfile:
                    zfile.extractall(path=output_dir)

                # delete downloaded file ...
                os.remove(dst_fname)
                # ... and any directories we might have created,
                # in which case sub_dir will be defined
                if sub_dir is not None:
                    shutil.rmtree(sub_dir)

                new_subdirs = os.listdir(output_dir)
                # if there is only one new subdirectory move its contents
                # to the top level directory; a single plain file is left
                # where it is
                if len(new_subdirs) == 1:
                    only_entry = path.join(output_dir, new_subdirs[0])
                    if path.isdir(only_entry):
                        copytree(only_entry, output_dir)
                        shutil.rmtree(only_entry)

        is_software = record["metadata"]["upload_type"] == "software"
        only_one_file = len(record["files"]) == 1
        for file_ref in record["files"]:
            yield from _fetch(file_ref, unzip=is_software and only_one_file)

    @property
    def content_id(self):
        """The Zenodo record ID as the content of a record is immutable"""
        # record_id is set by detect()
        return self.record_id