repo2docker/repo2docker/contentproviders/zenodo.py

84 wiersze
3.0 KiB
Python

import json
import os
import shutil
from os import makedirs, path
from urllib.error import HTTPError
from urllib.request import Request
from ..utils import copytree, deep_get
from .doi import DoiProvider
class Zenodo(DoiProvider):
    """Provide contents of a Zenodo deposit.

    The provider recognises record URLs on the hosts listed in ``self.hosts``
    and downloads every file attached to the matching record.
    """

    def __init__(self):
        super().__init__()
        # Populated by detect(); initialised here so that reading
        # ``content_id`` before a successful detect() yields None instead of
        # raising AttributeError.
        self.record_id = None
        # For each supported host we need the hostname (url where records
        # are), api url (for metadata), filepath (path to files in metadata),
        # filename (path to filename in metadata), download (path to file
        # download URL), and type (path to item type in metadata).
        self.hosts = [
            {
                "hostname": [
                    "https://sandbox.zenodo.org/record/",
                    "http://sandbox.zenodo.org/record/",
                ],
                "api": "https://sandbox.zenodo.org/api/records/",
                "filepath": "files",
                "filename": "filename",
                "download": "links.download",
                "type": "metadata.upload_type",
            },
            {
                "hostname": [
                    "https://zenodo.org/record/",
                    "http://zenodo.org/record/",
                ],
                "api": "https://zenodo.org/api/records/",
                "filepath": "files",
                "filename": "filename",
                "download": "links.download",
                "type": "metadata.upload_type",
            },
            {
                "hostname": [
                    "https://data.caltech.edu/records/",
                    "http://data.caltech.edu/records/",
                ],
                "api": "https://data.caltech.edu/api/record/",
                "filepath": "metadata.electronic_location_and_access",
                "filename": "electronic_name.0",
                "download": "uniform_resource_identifier",
                "type": "metadata.resourceType.resourceTypeGeneral",
            },
        ]

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Zenodo/Invenio record.

        ``doi`` is resolved to a URL with ``self.doi2url``; the URL is then
        matched against the known host prefixes. ``ref`` and ``extra_args``
        are accepted for interface compatibility and are not used here.

        Returns a spec dict ``{"record": <record id>, "host": <host entry>}``
        for the first matching host, or ``None`` when no host matches.
        As a side effect the record ID is stored on ``self.record_id``.
        """
        url = self.doi2url(doi)

        for host in self.hosts:
            # First prefix of this host the URL starts with, if any.
            prefix = next(
                (p for p in host["hostname"] if url.startswith(p)), None
            )
            if prefix is None:
                continue
            # Drop trailing slashes before taking the last path segment;
            # otherwise ".../record/1234/" would yield an empty record ID.
            remainder = url[len(prefix):].rstrip("/")
            self.record_id = remainder.rsplit("/", maxsplit=1)[-1]
            return {"record": self.record_id, "host": host}

        return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Zenodo record.

        ``spec`` is the dict produced by ``detect`` (keys ``"record"`` and
        ``"host"``). The record metadata is requested as JSON from the host's
        API URL, and every file listed under the host's ``"filepath"`` entry
        is handed to ``self.fetch_file`` together with ``output_dir``.

        This is a generator: it yields a progress message followed by
        whatever ``self.fetch_file`` yields for each file. ``yield_output``
        is not used by this implementation.
        """
        record_id = spec["record"]
        host = spec["host"]

        yield f"Fetching Zenodo record {record_id}.\n"
        resp = self.urlopen(
            f'{host["api"]}{record_id}',
            headers={"accept": "application/json"},
        )
        record = resp.json()

        files = deep_get(record, host["filepath"])
        # Unzipping is only requested when the record holds exactly one file.
        only_one_file = len(files) == 1
        for file_ref in files:
            yield from self.fetch_file(
                file_ref, host, output_dir, unzip=only_one_file
            )

    @property
    def content_id(self):
        """The Zenodo record ID as the content of a record is immutable.

        ``None`` until ``detect`` has matched a record.
        """
        return self.record_id