mirrored from https://github.com/jupyterhub/repo2docker
Add Dataverse content provider
parent 649a2c6726
commit 331a610324
MANIFEST.in
@@ -4,3 +4,4 @@ include setup.cfg
 recursive-include repo2docker/buildpacks *
 include versioneer.py
 include repo2docker/_version.py
+include repo2docker/contentproviders/dataverse.json
repo2docker/app.py
@@ -146,6 +146,7 @@ class Repo2Docker(Application):
             contentproviders.Local,
             contentproviders.Zenodo,
             contentproviders.Figshare,
+            contentproviders.Dataverse,
             contentproviders.Git,
         ],
         config=True,
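For context: repo2docker walks this list in order and uses the first provider whose detect() returns a spec, which is why Dataverse sits before the catch-all Git provider; otherwise Dataverse DOIs would be treated as git URLs. A minimal sketch of that selection step (hypothetical and simplified; the real loop lives in the Repo2Docker application and differs in detail):

    # Simplified provider selection; assumes each provider implements
    # detect(spec) and returns a dict (as Dataverse.detect below) or None.
    def pick_provider(content_providers, spec):
        for provider_class in content_providers:
            provider = provider_class()
            detected = provider.detect(spec)
            if detected is not None:
                return provider, detected
        raise ValueError("No content provider recognized: {}".format(spec))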
repo2docker/contentproviders/__init__.py
@@ -2,3 +2,4 @@ from .git import Git
 from .base import Local
 from .zenodo import Zenodo
 from .figshare import Figshare
+from .dataverse import Dataverse
repo2docker/contentproviders/dataverse.json
File diff suppressed because one or more lines are too long
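The suppressed file is the registry of known Dataverse installations that detect() checks URLs against. Its full contents are not shown here, but given how Dataverse.__init__ reads it, the shape is roughly the following (illustrative excerpt; the real file lists many installations and each entry may carry more fields than "url"):

    {
      "installations": [
        {"url": "https://dataverse.harvard.edu"},
        {"url": "https://demo.dataverse.org"}
      ]
    }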
repo2docker/contentproviders/dataverse.py
@@ -0,0 +1,129 @@
+import os
+import json
+import shutil
+
+from urllib.request import Request
+from urllib.parse import urlparse, urlunparse
+from zipfile import ZipFile
+
+from .doi import DoiProvider
+from ..utils import copytree, deep_get, is_doi, normalize_doi
+
+
+class Dataverse(DoiProvider):
+    """Provide contents of a Dataverse dataset."""
+
+    def __init__(self):
+        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
+        with open(data_file, "r") as fp:
+            self.hosts = json.load(fp)["installations"]
+        super().__init__()
+
+    def detect(self, doi, ref=None, extra_args=None):
+        """Trigger this provider for things that resolve to a Dataverse dataset.
+
+        Handles:
+        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
+        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
+        - URL {siteURL}/api/access/datafile/{fileId}
+
+        Examples:
+        - https://dataverse.harvard.edu/api/access/datafile/3323458
+        - doi:10.7910/DVN/6ZXAGT
+        - doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        """
+        url = self.doi2url(doi)
+
+        # Check if the URL matches any known Dataverse installation; bail if not.
+        host = next((host for host in self.hosts if url.startswith(host["url"])), None)
+        if host is None:
+            return
+
+        # Parse the URL to get the base for later API calls
+        parsed_url = urlparse(url)
+
+        # Corner case handling
+        if parsed_url.path.startswith("/file.xhtml"):
+            # There is no way to look up file information from its persistentId
+            # alone; the only thing we can do is assume that the DOI is structured
+            # as "doi:<dataset_doi>/<file_doi>" and handle the enclosing dataset.
+            new_doi = doi.rsplit("/", 1)[0]
+            if new_doi == doi:
+                # Tough luck :( Avoid infinite recursion and exit.
+                return
+            return self.detect(new_doi)
+        elif parsed_url.path.startswith("/api/access/datafile"):
+            # A raw URL pointing to a datafile is a typical output of an
+            # External Tool integration.
+            entity_id = os.path.basename(parsed_url.path)
+            search_query = "q=entityId:" + entity_id + "&type=file"
+            # Knowing the file identifier, query the Search API for the parent dataset.
+            search_url = urlunparse(
+                parsed_url._replace(path="/api/search", query=search_query)
+            )
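+            # For the docstring's datafile example this builds (illustratively):
+            # https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file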
+            resp = self.urlopen(search_url).read()
+            data = json.loads(resp.decode("utf-8"))["data"]
+            if data["count_in_response"] != 1:
+                self.log.debug("Dataverse search query failed!")
+                self.log.debug(" - doi = " + doi)
+                self.log.debug(" - url = " + url)
+                self.log.debug(" - resp = " + json.dumps(data))
+                return
+
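+            # deep_get follows a dotted path ("items.0...." = first hit in the
+            # search results), yielding the parent dataset's persistent id.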
+            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
+        elif is_doi(doi):
+            self.record_id = "doi:" + normalize_doi(doi)
+
+        if hasattr(self, "record_id"):
+            return {"record": self.record_id, "host": host}
+
+    def fetch(self, spec, output_dir, yield_output=False):
+        """Fetch and unpack a Dataverse dataset."""
+        record_id = spec["record"]
+        host = spec["host"]
+
+        yield "Fetching Dataverse record {}.\n".format(record_id)
+        req = Request(
+            "{}/api/datasets/:persistentId?persistentId={}".format(
+                host["url"], record_id
+            ),
+            headers={"accept": "application/json"},
+        )
+        resp = self.urlopen(req)
+        record = json.loads(resp.read().decode("utf-8"))["data"]
+
+        # To fetch the entire dataset we build a list of file IDs we want
+        # and then receive a zip file containing all of them.
+        # TODO: Dataverse has a limit for the zipfile size (see
+        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729).
+        # If the size of the dataset is greater than 100MB, individual files
+        # should be downloaded instead.
+        file_ids = [
+            str(deep_get(fobj, "dataFile.id"))
+            for fobj in deep_get(record, "latestVersion.files")
+        ]
+
+        req = Request(
+            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
+        )
+
+        dst_fname = os.path.join(output_dir, "dataverse.zip")
+        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
+            yield "Fetching files bundle\n"
+            shutil.copyfileobj(src, dst)
+
+        yield "Extracting files\n"
+        with ZipFile(dst_fname) as zfile:
+            zfile.extractall(path=output_dir)
+
+        os.remove(dst_fname)
+        new_subdirs = os.listdir(output_dir)
+        # If there is only one new subdirectory, move its contents
+        # to the top-level directory.
+        if len(new_subdirs) == 1:
+            d = new_subdirs[0]
+            copytree(os.path.join(output_dir, d), output_dir)
+            shutil.rmtree(os.path.join(output_dir, d))
+
+    @property
+    def content_id(self):
+        """The Dataverse persistent identifier (could use the internal dataset_id too)."""
+        return self.record_id
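The TODO in fetch() points at a possible follow-up: when a dataset exceeds the zip size limit, fall back to downloading files one by one. A hypothetical sketch of that fallback, reusing the record metadata already fetched above (it assumes each file entry carries a dataFile.filename field, and it ignores any directory structure for brevity; neither is confirmed by this commit):

    # Hypothetical per-file fallback for the zip size limit; not part of this commit.
    for fobj in deep_get(record, "latestVersion.files"):
        file_id = str(deep_get(fobj, "dataFile.id"))
        file_name = deep_get(fobj, "dataFile.filename")  # assumed field
        req = Request("{}/api/access/datafile/{}".format(host["url"], file_id))
        with self.urlopen(req) as src:
            with open(os.path.join(output_dir, file_name), "wb") as dst:
                shutil.copyfileobj(src, dst)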
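Taken together, detect() turns a DOI or URL into a {"record", "host"} spec and fetch() materializes the dataset into a directory. A hypothetical end-to-end use outside repo2docker's own driver loop (the spec shown is what the docstring examples suggest, not verified output):

    provider = Dataverse()
    spec = provider.detect("doi:10.7910/DVN/6ZXAGT")
    # e.g. {"record": "doi:10.7910/DVN/6ZXAGT", "host": {...}} when the DOI
    # resolves to a known installation; detect() returns nothing otherwise.
    if spec is not None:
        for message in provider.fetch(spec, "./dataset"):
            print(message, end="")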