Add basic Zenodo content provider

pull/693/head
Tim Head 2019-05-27 17:32:03 +02:00
rodzic 51898274f8
commit dce6c1e8d7
4 zmienionych plików z 165 dodań i 0 usunięć

Wyświetl plik

@ -136,6 +136,7 @@ class Repo2Docker(Application):
content_providers = List(
[
contentproviders.Local,
contentproviders.Zenodo,
contentproviders.Git,
],
config=True,

Wyświetl plik

@ -1,2 +1,3 @@
from .git import Git
from .base import Local
from .zenodo import Zenodo

Wyświetl plik

@ -0,0 +1,73 @@
import json
import shutil
from os import makedirs
from os import path
from urllib.request import urlopen, Request
from zipfile import ZipFile, is_zipfile
from .base import ContentProvider
from ..utils import copytree
class Zenodo(ContentProvider):
    """Provide contents of a Zenodo deposit."""

    def detect(self, doi, ref=None, extra_args=None):
        """Return a spec for `doi` if it carries the Zenodo prefix, else None.

        The DOI is resolved through https://doi.org/ and the record ID is
        taken from the last path segment of the response URL. The ID is also
        stored on `self.record_id` so that `content_id` can report it.

        `ref` and `extra_args` are not used by this provider.
        """
        # 10.5281 is the Zenodo DOI prefix
        if not doi.startswith('10.5281'):
            return None

        # Close the response once the final URL has been read.
        with urlopen("https://doi.org/{}".format(doi)) as resp:
            self.record_id = resp.url.rsplit("/", maxsplit=1)[1]
        return {'record': self.record_id}

    def fetch(self, spec, output_dir, yield_output=False):
        """Download every file of the record in `spec` into `output_dir`.

        This is a generator yielding progress messages. `spec` is the dict
        returned by `detect` (key ``'record'``). `yield_output` is not used
        by this implementation.

        If the record's upload type is "software" and it has exactly one
        file which is a zip archive, the archive is extracted into
        `output_dir` and then removed. If the extraction leaves a single
        top-level directory, its contents are moved up into `output_dir`.
        """
        # Local import: at module level only `makedirs` and `path` are
        # imported from `os`.
        import os

        record_id = spec['record']

        yield "Fetching Zenodo record {}.\n".format(record_id)
        req = Request("https://zenodo.org/api/records/{}".format(record_id),
                      headers={"accept": "application/json"})
        with urlopen(req) as resp:
            record = json.loads(resp.read().decode("utf-8"))

        def _fetch(file_ref, unzip=False):
            fname = file_ref["filename"]
            # Directory part of the file name, relative to output_dir
            # (empty when the file sits directly in output_dir).
            rel_dir = path.dirname(fname)
            makedirs(path.join(output_dir, rel_dir), exist_ok=True)
            dst_fname = path.join(output_dir, fname)

            yield "Fetching {}\n".format(fname)
            with urlopen(file_ref["links"]["download"]) as src, \
                    open(dst_fname, "wb") as dst:
                shutil.copyfileobj(src, dst)

            # The downloaded file is closed at this point, so it is safe
            # to inspect and extract it.
            if not (unzip and is_zipfile(dst_fname)):
                return

            with ZipFile(dst_fname) as zfile:
                zfile.extractall(path=output_dir)

            # Remove the archive itself, then any directories that were
            # created only to hold it. Stop at the first directory that is
            # not empty (the archive may have extracted files into it) so
            # extracted content and output_dir itself are never deleted.
            os.remove(dst_fname)
            while rel_dir:
                try:
                    os.rmdir(path.join(output_dir, rel_dir))
                except OSError:
                    break
                rel_dir = path.dirname(rel_dir)

            # If the archive unpacked into a single top-level directory,
            # move its contents up into output_dir.
            entries = os.listdir(output_dir)
            if len(entries) == 1:
                top_dir = path.join(output_dir, entries[0])
                if path.isdir(top_dir):
                    copytree(top_dir, output_dir)
                    shutil.rmtree(top_dir)

        is_software = record["metadata"]["upload_type"] == "software"
        only_one_file = len(record["files"]) == 1
        for file_ref in record['files']:
            yield from _fetch(file_ref, unzip=is_software and only_one_file)

    @property
    def content_id(self):
        """A unique ID to represent the version of the content.

        This is the Zenodo record ID stored by `detect`.
        """
        return self.record_id

Wyświetl plik

@ -4,6 +4,8 @@ import os
import re
import subprocess
from shutil import copystat, copy2
from traitlets import Integer, TraitError
@ -287,3 +289,91 @@ def check_ref(ref, cwd=None):
# We'll throw an error later if no refs resolve
pass
return hash
class Error(OSError):
    """Raised by copytree(); its first argument is the list of failures."""
# a copy of shutil.copytree() that is ok with the target directory
# already existing
def copytree(src, dst, symlinks=False, ignore=None, copy_function=copy2,
             ignore_dangling_symlinks=False):
    """Recursively copy a directory tree.

    Unlike shutil.copytree(), the destination directory may already exist;
    it is created if missing and files are copied into it.
    If exception(s) occur, an Error is raised with a list of reasons.

    If the optional symlinks flag is true, symbolic links in the
    source tree result in symbolic links in the destination tree; if
    it is false, the contents of the files pointed to by symbolic
    links are copied. If the file pointed by the symlink doesn't
    exist, an exception will be added in the list of errors raised in
    an Error exception at the end of the copy process.

    You can set the optional ignore_dangling_symlinks flag to true if you
    want to silence this exception. The flag applies to the whole tree,
    including nested directories. Notice that this has no effect on
    platforms that don't support os.symlink.

    The optional ignore argument is a callable. If given, it
    is called with the `src` parameter, which is the directory
    being visited by copytree(), and `names` which is the list of
    `src` contents, as returned by os.listdir():

        callable(src, names) -> ignored_names

    Since copytree() is called recursively, the callable will be
    called once for each directory that is copied. It returns a
    list of names relative to the `src` directory that should
    not be copied.

    The optional copy_function argument is a callable that will be used
    to copy each file. It will be called with the source path and the
    destination path as arguments. By default, copy2() is used, but any
    function that supports the same signature (like copy()) can be used.

    Returns `dst`.
    """
    names = os.listdir(src)
    if ignore is not None:
        ignored_names = ignore(src, names)
    else:
        ignored_names = set()

    os.makedirs(dst, exist_ok=True)
    errors = []
    for name in names:
        if name in ignored_names:
            continue
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if os.path.islink(srcname):
                linkto = os.readlink(srcname)
                if symlinks:
                    # We can't just leave it to `copy_function` because legacy
                    # code with a custom `copy_function` may rely on copytree
                    # doing the right thing.
                    os.symlink(linkto, dstname)
                    copystat(srcname, dstname, follow_symlinks=not symlinks)
                else:
                    # ignore dangling symlink if the flag is on.
                    # Test the link itself (exists() follows it) rather than
                    # `linkto`, which for a relative target would be resolved
                    # against the current working directory.
                    if ignore_dangling_symlinks and not os.path.exists(srcname):
                        continue
                    # otherwise let the copy occurs. copy2 will raise an error
                    if os.path.isdir(srcname):
                        copytree(srcname, dstname, symlinks, ignore,
                                 copy_function, ignore_dangling_symlinks)
                    else:
                        copy_function(srcname, dstname)
            elif os.path.isdir(srcname):
                # Pass ignore_dangling_symlinks down so the flag also applies
                # to links found in subdirectories.
                copytree(srcname, dstname, symlinks, ignore, copy_function,
                         ignore_dangling_symlinks)
            else:
                # Will raise a SpecialFileError for unsupported file types
                copy_function(srcname, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except Error as err:
            errors.extend(err.args[0])
        except OSError as why:
            errors.append((srcname, dstname, str(why)))
    try:
        copystat(src, dst)
    except OSError as why:
        # Copying file access times may fail on Windows
        if getattr(why, 'winerror', None) is None:
            errors.append((src, dst, str(why)))
    if errors:
        raise Error(errors)
    return dst