repo2docker/repo2docker/contentproviders/dataverse.py

import json
import os
import shutil
from urllib.parse import parse_qs, urlparse, urlunparse
from ..utils import copytree, deep_get
from .doi import DoiProvider
class Dataverse(DoiProvider):
    """
    Provide contents of a Dataverse dataset.

    This class loads a list of existing Dataverse installations from the internal
    file dataverse.json. This file is manually updated with the following command:

        python setup.py generate_dataverse_file
    """

    def __init__(self):
        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
        with open(data_file) as fp:
            self.hosts = json.load(fp)["installations"]

        super().__init__()
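    # Illustrative shape of the dataverse.json data assumed by __init__ (the
    # values are examples, not the real file contents):
    #
    #   {"installations": [{"url": "https://dataverse.harvard.edu", ...}, ...]}
    #
    # Each installation entry needs at least a "url" key: detect() matches the
    # resolved DOI against each entry's netloc, and fetch() uses the same URL
    # as the API base.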
    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Dataverse dataset.

        Handles:
        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
        - URL {siteURL}/api/access/datafile/{fileId}

        Examples:
        - https://dataverse.harvard.edu/api/access/datafile/3323458
        - doi:10.7910/DVN/6ZXAGT
        - doi:10.7910/DVN/6ZXAGT/3YRRYJ
        """
        url = self.doi2url(doi)
        # Parse the url to get the base for later API calls
        parsed_url = urlparse(url)

        # Check if the url matches any known Dataverse installation, bail if not.
        host = next(
            (
                host
                for host in self.hosts
                if urlparse(host["url"]).netloc == parsed_url.netloc
            ),
            None,
        )
        if host is None:
            return

        query_args = parse_qs(parsed_url.query)

        # Corner case handling
        if parsed_url.path.startswith("/file.xhtml"):
            # There is no way to look up file information by its persistentId; the
            # only thing we can do is assume the DOI is structured as
            # "doi:<dataset_doi>/<file_doi>" and try to handle the dataset that way.
            new_doi = doi.rsplit("/", 1)[0]
            if new_doi == doi:
                # tough luck :( Avoid infinite recursion and exit.
                return
            return self.detect(new_doi)
        elif parsed_url.path.startswith("/api/access/datafile"):
            # Raw url pointing to a datafile is a typical output from an External Tool integration
            entity_id = os.path.basename(parsed_url.path)
            search_query = "q=entityId:" + entity_id + "&type=file"
            # Knowing the file identifier, query the search API to get the parent dataset
            search_url = urlunparse(
                parsed_url._replace(path="/api/search", query=search_query)
            )
            self.log.debug("Querying Dataverse: " + search_url)
            data = self.urlopen(search_url).json()["data"]
            if data["count_in_response"] != 1:
                self.log.debug(
                    f"Dataverse search query failed!\n - doi: {doi}\n - url: {url}\n - resp: {json.dumps(data)}\n"
                )
                return

            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
        elif (
            parsed_url.path.startswith("/dataset.xhtml")
            and "persistentId" in query_args
        ):
            self.record_id = deep_get(query_args, "persistentId.0")

        if hasattr(self, "record_id"):
            return {"record": self.record_id, "host": host}
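    # For example, detect("doi:10.7910/DVN/6ZXAGT") would, assuming the Harvard
    # Dataverse entry from dataverse.json, return something like:
    #
    #   {"record": "doi:10.7910/DVN/6ZXAGT",
    #    "host": {"url": "https://dataverse.harvard.edu", ...}}
    #
    # which fetch() then receives as its `spec` argument.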
    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Dataverse dataset."""
        record_id = spec["record"]
        host = spec["host"]

        yield f"Fetching Dataverse record {record_id}.\n"
        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'

        resp = self.urlopen(url, headers={"accept": "application/json"})
        record = resp.json()["data"]

        for fobj in deep_get(record, "latestVersion.files"):
            file_url = (
                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}'
            )
            filename = os.path.join(fobj.get("directoryLabel", ""), fobj["label"])

            file_ref = {"download": file_url, "filename": filename}
            fetch_map = {key: key for key in file_ref.keys()}

            yield from self.fetch_file(file_ref, fetch_map, output_dir)

        new_subdirs = os.listdir(output_dir)
        # if there is only one new subdirectory move its contents
        # to the top level directory
        if len(new_subdirs) == 1 and os.path.isdir(
            os.path.join(output_dir, new_subdirs[0])
        ):
            d = new_subdirs[0]
            copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))
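    # Each file_ref built in the loop above looks roughly like (values are
    # illustrative):
    #
    #   {"download": "https://dataverse.harvard.edu/api/access/datafile/3323458",
    #    "filename": "some/dir/label.csv"}
    #
    # and fetch_map simply maps every key to itself before the pair is handed
    # to self.fetch_file().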
    @property
    def content_id(self):
        """The Dataverse persistent identifier."""
        return self.record_id
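# A minimal usage sketch (normally repo2docker's content-provider machinery
# drives these calls; the DOI and output path are just examples):
#
#   provider = Dataverse()
#   spec = provider.detect("doi:10.7910/DVN/6ZXAGT")
#   if spec is not None:
#       for msg in provider.fetch(spec, "/tmp/dataverse-dataset"):
#           print(msg, end="")
#       print(provider.content_id)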