repo2docker/repo2docker/contentproviders/doi.py

110 wiersze
3.5 KiB
Python

import os
import json
import shutil
import logging
from os import makedirs
from os import path
from requests import Session, HTTPError
from zipfile import ZipFile, is_zipfile
from .base import ContentProvider
from ..utils import copytree, deep_get
from ..utils import normalize_doi, is_doi
from .. import __version__
class DoiProvider(ContentProvider):
"""Provide contents of a repository identified by a DOI and some helper functions."""
def __init__(self):
super().__init__()
self.session = Session()
self.session.headers.update(
{
"user-agent": f"repo2docker {__version__}",
}
)
def _request(self, url, **kwargs):
return self.session.get(url, **kwargs)
urlopen = _request
def _urlopen(self, req, headers=None):
"""A urlopen() helper"""
# someone passed a string, not a request
if not isinstance(req, request.Request):
req = request.Request(req)
req.add_header("User-Agent", f"repo2docker {__version__}")
if headers is not None:
for key, value in headers.items():
req.add_header(key, value)
return request.urlopen(req)
def doi2url(self, doi):
# Transform a DOI to a URL
# If not a doi, assume we have a URL and return
if is_doi(doi):
doi = normalize_doi(doi)
try:
resp = self._request(f"https://doi.org/{doi}")
resp.raise_for_status()
# If the DOI doesn't resolve, just return URL
except HTTPError:
return doi
return resp.url
else:
# Just return what is actulally just a URL
return doi
def fetch_file(self, file_ref, host, output_dir, unzip=False):
# the assumption is that `unzip=True` means that this is the only
# file related to a record
file_url = deep_get(file_ref, host["download"])
fname = deep_get(file_ref, host["filename"])
logging.debug(f"Downloading file {file_url} as {fname}\n")
yield f"Requesting {file_url}\n"
resp = self._request(file_url, stream=True)
resp.raise_for_status()
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
yield f"Creating {sub_dir}\n"
makedirs(sub_dir, exist_ok=True)
dst_fname = path.join(output_dir, fname)
with open(dst_fname, "wb") as dst:
yield f"Fetching {fname}\n"
for chunk in resp.iter_content(chunk_size=None):
dst.write(chunk)
if unzip and is_zipfile(dst_fname):
yield f"Extracting {fname}\n"
zfile = ZipFile(dst_fname)
zfile.extractall(path=output_dir)
zfile.close()
# delete downloaded file ...
os.remove(dst_fname)
# ... and any directories we might have created,
# in which case sub_dir will be defined
if path.dirname(fname):
shutil.rmtree(sub_dir)
new_subdirs = os.listdir(output_dir)
# if there is only one new subdirectory move its contents
# to the top level directory
if len(new_subdirs) == 1:
d = new_subdirs[0]
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))
yield f"Fetched files: {os.listdir(output_dir)}\n"