import os
import json
import shutil

from urllib.request import Request
from urllib.parse import urlparse, urlunparse
from zipfile import ZipFile

from .doi import DoiProvider
from ..utils import copytree, deep_get, is_doi, normalize_doi


class Dataverse(DoiProvider):
    """Provide contents of a Dataverse dataset."""

    def __init__(self):
        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
        with open(data_file, "r") as fp:
            self.hosts = json.load(fp)["installations"]
        super().__init__()

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Dataverse dataset.

        Handles:
        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
        - URL {siteURL}/api/access/datafile/{fileId}

        Examples:
        - https://dataverse.harvard.edu/api/access/datafile/3323458
        - doi:10.7910/DVN/6ZXAGT
        - doi:10.7910/DVN/6ZXAGT/3YRRYJ
        """
        url = self.doi2url(doi)
        # Check if the url matches any known Dataverse installation, bail if not.
        host = next(
            (host for host in self.hosts if url.startswith(host["url"])), None
        )
        if host is None:
            return

        # Parse the url, to get the base for later API calls
        parsed_url = urlparse(url)

        # Corner case handling
        if parsed_url.path.startswith("/file.xhtml"):
            # There's no way of getting file information using its persistentId; the only
            # thing we can do is assume that the doi is structured as
            # "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
            new_doi = doi.rsplit("/", 1)[0]
            if new_doi == doi:
                # tough luck :( Avoid infinite recursion and exit.
                return
            return self.detect(new_doi)
        elif parsed_url.path.startswith("/api/access/datafile"):
            # A raw url pointing to a datafile is a typical output from an External Tool integration
            entity_id = os.path.basename(parsed_url.path)
            search_query = "q=entityId:" + entity_id + "&type=file"
            # Knowing the file identifier, query the search API to get the parent dataset
            search_url = urlunparse(
                parsed_url._replace(path="/api/search", query=search_query)
            )
            resp = self.urlopen(search_url).read()
            data = json.loads(resp.decode("utf-8"))["data"]
            if data["count_in_response"] != 1:
                self.log.debug("Dataverse search query failed!")
                self.log.debug(" - doi = " + doi)
                self.log.debug(" - url = " + url)
                self.log.debug(" - resp = " + json.dumps(data))
                return
            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
        elif is_doi(doi):
            self.record_id = "doi:" + normalize_doi(doi)

        if hasattr(self, "record_id"):
            return {"record": self.record_id, "host": host}

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Dataverse dataset."""
        record_id = spec["record"]
        host = spec["host"]

        yield "Fetching Dataverse record {}.\n".format(record_id)
        req = Request(
            "{}/api/datasets/:persistentId?persistentId={}".format(
                host["url"], record_id
            ),
            headers={"accept": "application/json"},
        )
        resp = self.urlopen(req)
        record = json.loads(resp.read().decode("utf-8"))["data"]

        # In order to fetch the entire dataset we build a list of file IDs we want to fetch
        # and then receive a zip file containing all of them.
        # TODO: Dataverse has a limit for the zipfile size (see
        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729)
        # If the size of the dataset is greater than 100MB, individual files should be downloaded.
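        # A hedged sketch of that per-file fallback, kept as a comment so the
        # bundled-zip path below stays the actual behavior. It reuses the
        # {siteURL}/api/access/datafile/{fileId} endpoint mentioned in detect()
        # and assumes each file object also exposes "dataFile.filename":
        #
        #     for fobj in deep_get(record, "latestVersion.files"):
        #         file_req = Request(
        #             "{}/api/access/datafile/{}".format(
        #                 host["url"], deep_get(fobj, "dataFile.id")
        #             )
        #         )
        #         fname = os.path.join(
        #             output_dir, deep_get(fobj, "dataFile.filename")
        #         )
        #         with self.urlopen(file_req) as src, open(fname, "wb") as dst:
        #             shutil.copyfileobj(src, dst)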
        file_ids = [
            str(deep_get(fobj, "dataFile.id"))
            for fobj in deep_get(record, "latestVersion.files")
        ]

        req = Request(
            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
        )

        dst_fname = os.path.join(output_dir, "dataverse.zip")
        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
            yield "Fetching files bundle\n"
            shutil.copyfileobj(src, dst)

        yield "Extracting files\n"
        with ZipFile(dst_fname) as zfile:
            zfile.extractall(path=output_dir)

        os.remove(dst_fname)
        new_subdirs = os.listdir(output_dir)
        # if there is only one new subdirectory, move its contents
        # to the top level directory
        if len(new_subdirs) == 1:
            d = new_subdirs[0]
            copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))

    @property
    def content_id(self):
        """The Dataverse persistent identifier (could use the internal dataset_id too)."""
        return self.record_id
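

# A minimal usage sketch, not part of the provider's public API: it assumes the
# module is importable as part of its package (e.g. run via ``python -m`` so the
# relative imports resolve) and that dataverse.harvard.edu is reachable. The DOI
# is the example already given in the detect() docstring above.
if __name__ == "__main__":
    import tempfile

    provider = Dataverse()
    spec = provider.detect("doi:10.7910/DVN/6ZXAGT")
    if spec is not None:
        with tempfile.TemporaryDirectory() as output_dir:
            # fetch() is a generator: iterating it drives the download and
            # yields progress messages.
            for message in provider.fetch(spec, output_dir):
                print(message, end="")
            print("content_id:", provider.content_id)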