2019-05-27 17:53:20 +00:00
|
|
|
import os
|
2019-05-27 15:32:03 +00:00
|
|
|
import json
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
from os import makedirs
|
|
|
|
from os import path
|
2019-05-29 06:17:22 +00:00
|
|
|
from urllib.request import build_opener, urlopen, Request
|
2019-05-27 15:32:03 +00:00
|
|
|
from zipfile import ZipFile, is_zipfile
|
2019-06-13 18:09:34 +00:00
|
|
|
from idutils import normalize_doi, is_doi
|
2019-05-27 15:32:03 +00:00
|
|
|
|
|
|
|
from .base import ContentProvider
|
|
|
|
from ..utils import copytree
|
2019-05-29 06:17:22 +00:00
|
|
|
from .. import __version__
|
2019-05-27 15:32:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Zenodo(ContentProvider):
    """Provide contents of a Zenodo deposit."""

    def _urlopen(self, req, headers=None):
        """Open ``req`` with the repo2docker User-Agent header attached.

        Parameters
        ----------
        req : str or urllib.request.Request
            The URL (or a prepared request) to open. A plain string is
            wrapped in a ``Request``.
        headers : dict, optional
            Extra headers added to the request after the User-Agent.

        Returns
        -------
        The response object returned by ``urllib.request.urlopen``.
        """
        # someone passed a string, not a request
        if not isinstance(req, Request):
            req = Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return urlopen(req)

    def _doi2url(self, doi):
        """Transform a DOI into the URL it resolves to.

        If ``doi`` is not a DOI it is assumed to already be a URL and is
        returned unchanged.
        """
        if not is_doi(doi):
            return doi

        doi = normalize_doi(doi)
        # Only the final (post-redirect) URL is needed, so close the
        # response as soon as it has been read off.
        with self._urlopen("https://doi.org/{}".format(doi)) as resp:
            return resp.url

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Zenodo/Invenio record.

        Parameters
        ----------
        doi : str
            A DOI or a URL.
        ref, extra_args
            Not used by this provider.

        Returns
        -------
        dict or None
            ``{"record": <record id>, "host": <host description>}`` when the
            resolved URL starts with one of the known host prefixes,
            otherwise ``None``. On a match ``self.record_id`` is set as well.
        """
        # We need the hostname (url where records are), api url (for metadata),
        # filepath (path to files in metadata), filename (path to filename in
        # metadata), type (path to type in metadata)
        hosts = [
            {
                "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
                "api": "https://zenodo.org/api/records/",
                "filepath": "files",
                "filename": "files.key",
                "download": "links.download",
                "type": "metadata.upload_type",
            },
            {
                "hostname": [
                    "https://data.caltech.edu/records/",
                    "http://data.caltech.edu/records/",
                ],
                "api": "https://data.caltech.edu/api/record/",
                "filepath": "files",
                "filename": "electronic_location_and_access.electronic_name.0",
                "type": "metadata.resourceType.resourceTypeGeneral",
            },
        ]

        url = self._doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of candidate prefixes
            if url.startswith(tuple(host["hostname"])):
                # the record id is whatever follows the last "/" of the URL
                self.record_id = url.rsplit("/", maxsplit=1)[1]
                return {"record": self.record_id, "host": host}

        return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Zenodo record.

        Parameters
        ----------
        spec : dict
            The value returned by ``detect()``; its ``"record"`` and
            ``"host"`` entries are read.
        output_dir : str
            Directory the record's files are written to.
        yield_output : bool
            Not used by this implementation.

        Yields
        ------
        str
            Progress messages, one per step.
        """
        record_id = spec["record"]
        host = spec["host"]

        yield "Fetching Zenodo record {}.\n".format(record_id)
        req = Request(
            "{}{}".format(host["api"], record_id),
            headers={"accept": "application/json"},
        )
        # close the metadata response as soon as the body has been read
        with self._urlopen(req) as resp:
            record = json.loads(resp.read().decode("utf-8"))

        # NOTE(review): only host["api"] is consulted; the "filepath",
        # "filename", "download" and "type" entries of the host description
        # are not used below, which hard-codes the Zenodo metadata layout.
        # Confirm whether other hosts are expected to work here.

        def _fetch(file_ref, unzip=False):
            """Download one file of the record, optionally unpacking it."""
            # the assumption is that `unzip=True` means that this is the only
            # file related to the zenodo record
            fname = file_ref["filename"]
            # NOTE(review): fname comes from the remote metadata and is
            # joined to output_dir unchecked -- confirm it can never be an
            # absolute path or contain "..".
            rel_dir = path.dirname(fname)
            sub_dir = path.join(output_dir, rel_dir) if rel_dir else None
            dst_fname = path.join(output_dir, fname)

            with self._urlopen(file_ref["links"]["download"]) as src:
                if sub_dir is not None and not path.exists(sub_dir):
                    yield "Creating {}\n".format(sub_dir)
                    makedirs(sub_dir, exist_ok=True)

                with open(dst_fname, "wb") as dst:
                    yield "Fetching {}\n".format(fname)
                    shutil.copyfileobj(src, dst)

            # The download is complete and both the connection and the newly
            # written file are closed; continue processing the file on disk.
            if not (unzip and is_zipfile(dst_fname)):
                return

            yield "Extracting {}\n".format(fname)
            # the context manager closes the archive even if extraction fails
            with ZipFile(dst_fname) as zfile:
                zfile.extractall(path=output_dir)

            # delete downloaded file ...
            os.remove(dst_fname)
            # ... and any directories we might have created
            if sub_dir is not None:
                shutil.rmtree(sub_dir)

            # if there is only one new subdirectory move its contents
            # to the top level directory
            entries = os.listdir(output_dir)
            if len(entries) == 1:
                only_entry = path.join(output_dir, entries[0])
                # An archive holding a single top-level *file* must be left
                # alone: rmtree refuses to remove a non-directory.
                if path.isdir(only_entry):
                    copytree(only_entry, output_dir)
                    shutil.rmtree(only_entry)

        is_software = record["metadata"]["upload_type"] == "software"
        only_one_file = len(record["files"]) == 1
        for file_ref in record["files"]:
            yield from _fetch(file_ref, unzip=is_software and only_one_file)

    @property
    def content_id(self):
        """The Zenodo record ID as the content of a record is immutable.

        The value is the ``record_id`` stored by ``detect()``.
        """
        return self.record_id
|