repo2docker/repo2docker/contentproviders/dataverse.py

import json
import os
import shutil
from urllib.parse import parse_qs, urlparse, urlunparse

from ..utils import copytree, deep_get, is_doi
from .doi import DoiProvider


class Dataverse(DoiProvider):
    """
    Provide contents of a Dataverse dataset.

    This class loads a a list of existing Dataverse installations from the internal
    file dataverse.json. This file is manually updated with the following command:

        python setup.py generate_dataverse_file
    """

    def __init__(self):
        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
        with open(data_file) as fp:
            self.hosts = json.load(fp)["installations"]
        super().__init__()

    def detect(self, spec, ref=None, extra_args=None):
        """
        Detect if given spec is hosted on dataverse

        The spec can be:
        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
        - URL {siteURL}/api/access/datafile/{fileId}

        Examples:
        - https://dataverse.harvard.edu/api/access/datafile/3323458
        - doi:10.7910/DVN/6ZXAGT
        - doi:10.7910/DVN/6ZXAGT/3YRRYJ
        """
        if is_doi(spec):
            url = self.doi2url(spec)
        else:
            url = spec
        # Parse the url, to get the base for later API calls
        parsed_url = urlparse(url)

        # Check if the url matches any known Dataverse installation, bail if not.
        host = next(
            (
                host
                for host in self.hosts
                if urlparse(host["url"]).netloc == parsed_url.netloc
            ),
            None,
        )
        if host is None:
            return

        # At this point, we *know* this is a dataverse URL, because:
        # 1. The DOI resolved to a particular host (if using DOI)
        # 2. The host is in the list of known dataverse installations
        #
        # We don't know exactly what kind of dataverse object this is, but
        # that can be figured out during fetch as needed
        return {"host": host, "url": url}

    def get_persistent_id_from_url(self, url: str) -> str:
        """
        Return the persistentId for given dataverse URL.

        Supports the following *dataset* URL styles:
        - /citation: https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
        - /dataset.xhtml: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP

        Supports the following *file* URL styles:
        - /api/access/datafile: https://dataverse.harvard.edu/api/access/datafile/3323458

        Supports a subset of the following *file* URL styles:
        - /file.xhtml: https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ

        If a URL can not be parsed, throw an exception
        """
        parsed_url = urlparse(url)
        path = parsed_url.path
        qs = parse_qs(parsed_url.query)

        # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP
        # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP
        if path.startswith("/citation") or path.startswith("/dataset.xhtml"):
            return qs["persistentId"][0]
        #  https://dataverse.harvard.edu/api/access/datafile/3323458
        elif path.startswith("/api/access/datafile"):
            # What we have here is an entity id, which we can use to get a persistentId
            entity_id = os.path.basename(parsed_url.path)
            # FIXME: Should we be URL Encoding something here to protect from path traversal
            # or similar attacks?
            search_query = f"q=entityId:{entity_id}&type=file"
            search_api_url = urlunparse(
                parsed_url._replace(path="/api/search", query=search_query)
            )
            self.log.debug("Querying Dataverse: " + search_api_url)
            data = self.urlopen(search_api_url).json()["data"]
            if data["count_in_response"] != 1:
                raise ValueError(
                    f"Dataverse search query failed!\n - url: {url}\n - resp: {json.dumps(data)}\n"
                )
            return data["items"][0]["dataset_persistent_id"]
        elif parsed_url.path.startswith("/file.xhtml"):
            file_persistent_id = qs["persistentId"][0]
            dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
            if file_persistent_id == dataset_persistent_id:
                # We can't figure this one out, throw an error
                raise ValueError(f"Could not find dataset id for {url}")
            return dataset_persistent_id

        raise ValueError(f"Could not determine persistent id for dataverse URL {url}")

    def get_datafiles(self, host: str, persistent_id: str) -> list[dict]:
        """
        Return a list of dataFiles for given persistent_id
        """
        dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"

        resp = self._request(dataset_url, headers={"accept": "application/json"})
        # Assume it's a dataset
        is_dataset = True
        if resp.status_code == 404:
            # It's possible this is a *file* persistent_id, not a dataset one
            file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
            resp = self._request(file_url, headers={"accept": "application/json"})

            if resp.status_code == 404:
                # This persistent id is just not here
                raise ValueError(f"{persistent_id} on {host} is not found")

            # It's not a dataset, it's a file!
            is_dataset = False

        # We already handled 404, raise error for everything else
        resp.raise_for_status()

        data = resp.json()["data"]

        if is_dataset:
            return data["latestVersion"]["files"]
        else:
            # Only one file object
            return [data]

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Dataverse dataset."""
        url = spec["url"]
        host = spec["host"]

        persistent_id = self.get_persistent_id_from_url(url)

        yield f"Fetching Dataverse record {persistent_id}.\n"

        for fobj in self.get_datafiles(host["url"], persistent_id):
            file_url = (
                # without format=original you get the preservation format (plain text, tab separated)
                f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
            )
            filename = fobj["label"]
            original_filename = fobj["dataFile"].get("originalFileName", None)
            if original_filename:
                # replace preservation format filename (foo.tab) with original filename (foo.dta)
                filename = original_filename

            filename_with_path = os.path.join(fobj.get("directoryLabel", ""), filename)

            file_ref = {"download": file_url, "filename": filename_with_path}
            fetch_map = {key: key for key in file_ref.keys()}

            yield from self.fetch_file(file_ref, fetch_map, output_dir)

        new_subdirs = os.listdir(output_dir)
        # if there is only one new subdirectory move its contents
        # to the top level directory
        if len(new_subdirs) == 1 and os.path.isdir(new_subdirs[0]):
            d = new_subdirs[0]
            copytree(os.path.join(output_dir, d), output_dir)
            shutil.rmtree(os.path.join(output_dir, d))

        # Save persistent id
        self.persitent_id = persistent_id

    @property
    def content_id(self):
        """The Dataverse persistent identifier."""
        return self.persistent_id