Add Dataverse content provider

pull/739/head
Kacper Kowalik (Xarthisius) 2019-07-10 17:44:03 -05:00
parent 649a2c6726
commit 331a610324
No known key found for this signature in the database
GPG key ID: 5D21B852895192F9
5 changed files with 133 additions and 0 deletions

@@ -4,3 +4,4 @@ include setup.cfg
recursive-include repo2docker/buildpacks *
include versioneer.py
include repo2docker/_version.py
include repo2docker/contentproviders/dataverse.json

@@ -146,6 +146,7 @@ class Repo2Docker(Application):
contentproviders.Local,
contentproviders.Zenodo,
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Git,
],
config=True,

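The ordering of this list determines which provider handles a given spec: repo2docker tries each registered content provider in turn and uses the first one whose detect() recognizes the input, which is why Dataverse is added before the generic Git fallback. Below is a minimal sketch of that selection loop; it is a simplified illustration rather than the actual app.py code, and pick_provider is a hypothetical helper name.

def pick_provider(spec, provider_classes):
    # Try each provider class in the configured order; the first detect()
    # that returns a spec dict wins. Git, last in the list, acts as the fallback.
    for provider_class in provider_classes:
        provider = provider_class()
        detected = provider.detect(spec)
        if detected is not None:
            return provider, detected
    raise ValueError("No content provider found for {}".format(spec))
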
@@ -2,3 +2,4 @@ from .git import Git
from .base import Local
from .zenodo import Zenodo
from .figshare import Figshare
from .dataverse import Dataverse

File diff suppressed because one or more lines are too long

@@ -0,0 +1,129 @@
import os
import json
import shutil
from urllib.request import Request
from urllib.parse import urlparse, urlunparse
from zipfile import ZipFile
from .doi import DoiProvider
from ..utils import copytree, deep_get, is_doi, normalize_doi


class Dataverse(DoiProvider):
"""Provide contents of a Dataverse dataset."""
def __init__(self):
data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
with open(data_file, "r") as fp:
self.hosts = json.load(fp)["installations"]
        super().__init__()

    def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Dataverse dataset.
Handles:
- DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
- DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
- URL {siteURL}/api/access/datafile/{fileId}
Examples:
- https://dataverse.harvard.edu/api/access/datafile/3323458
- doi:10.7910/DVN/6ZXAGT
- doi:10.7910/DVN/6ZXAGT/3YRRYJ
"""
url = self.doi2url(doi)
# Check if the url matches any known Dataverse installation, bail if not.
host = next((host for host in self.hosts if url.startswith(host["url"])), None)
if host is None:
return
        # Parse the url to get the base for later API calls
parsed_url = urlparse(url)
# Corner case handling
if parsed_url.path.startswith("/file.xhtml"):
            # There's no way to get file information from its persistentId alone; the only thing
            # we can do is assume that the doi is structured as "doi:<dataset_doi>/<file_doi>"
            # and try to handle the dataset that way.
new_doi = doi.rsplit("/", 1)[0]
if new_doi == doi:
                # Tough luck :( Avoid infinite recursion and exit.
return
return self.detect(new_doi)
elif parsed_url.path.startswith("/api/access/datafile"):
            # A raw URL pointing to a datafile is typical output from an External Tool integration
entity_id = os.path.basename(parsed_url.path)
search_query = "q=entityId:" + entity_id + "&type=file"
            # Knowing the file identifier, query the search API to find the parent dataset
search_url = urlunparse(
parsed_url._replace(path="/api/search", query=search_query)
)
resp = self.urlopen(search_url).read()
data = json.loads(resp.decode("utf-8"))["data"]
if data["count_in_response"] != 1:
self.log.debug("Dataverse search query failed!")
self.log.debug(" - doi = " + doi)
self.log.debug(" - url = " + url)
self.log.debug(" - resp = " + json.dumps(data))
return
self.record_id = deep_get(data, "items.0.dataset_persistent_id")
elif is_doi(doi):
self.record_id = "doi:" + normalize_doi(doi)
if hasattr(self, "record_id"):
return {"record": self.record_id, "host": host}
def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Dataverse dataset."""
record_id = spec["record"]
host = spec["host"]
yield "Fetching Dataverse record {}.\n".format(record_id)
req = Request(
"{}/api/datasets/:persistentId?persistentId={}".format(
host["url"], record_id
),
headers={"accept": "application/json"},
)
resp = self.urlopen(req)
record = json.loads(resp.read().decode("utf-8"))["data"]
        # In order to fetch the entire dataset we build a list of file IDs we want to fetch
        # and then receive a single zip file containing all of them.
        # TODO: Dataverse has a limit on the zipfile size (see
        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729).
        # If the dataset is larger than 100MB, individual files should be downloaded instead.
file_ids = [
str(deep_get(fobj, "dataFile.id"))
for fobj in deep_get(record, "latestVersion.files")
]
req = Request(
"{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
)
dst_fname = os.path.join(output_dir, "dataverse.zip")
with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
yield "Fetching files bundle\n"
shutil.copyfileobj(src, dst)
yield "Extracting files\n"
with ZipFile(dst_fname) as zfile:
zfile.extractall(path=output_dir)
os.remove(dst_fname)
new_subdirs = os.listdir(output_dir)
        # If there is only one new subdirectory, move its contents
        # to the top-level directory
if len(new_subdirs) == 1:
d = new_subdirs[0]
copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))

    @property
def content_id(self):
"""The Dataverse persistent identifier (could use internal dataset_id too)."""
return self.record_id
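
For orientation, here is a hedged usage sketch of the new provider, assuming the class behaves as shown in the diff above: detect() maps a DOI or Dataverse URL to a {"record", "host"} spec, and fetch() yields progress messages while downloading the dataset into a local directory. The example DOI comes from the detect() docstring; running this performs real network requests against Harvard Dataverse.

import tempfile

from repo2docker.contentproviders import Dataverse

provider = Dataverse()
# Example DOI from the docstring above; detect() returns None for non-Dataverse input.
spec = provider.detect("doi:10.7910/DVN/6ZXAGT")
if spec is not None:
    with tempfile.TemporaryDirectory() as output_dir:
        # fetch() is a generator of progress messages; iterating it drives the download.
        for message in provider.fetch(spec, output_dir):
            print(message, end="")
        print("content_id:", provider.content_id)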