mirrored from https://github.com/jupyterhub/repo2docker
Add Dataverse content provider
parent 649a2c6726
commit 331a610324
MANIFEST.in
@@ -4,3 +4,4 @@ include setup.cfg
 recursive-include repo2docker/buildpacks *
 include versioneer.py
 include repo2docker/_version.py
+include repo2docker/contentproviders/dataverse.json
repo2docker/app.py
@@ -146,6 +146,7 @@ class Repo2Docker(Application):
             contentproviders.Local,
             contentproviders.Zenodo,
             contentproviders.Figshare,
+            contentproviders.Dataverse,
             contentproviders.Git,
         ],
         config=True,
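For context: repo2docker walks this list in order and uses the first provider whose detect() returns a spec, which is why Dataverse sits before the catch-all Git provider; otherwise Dataverse DOIs would be treated as git URLs. A minimal sketch of that selection step (hypothetical and simplified; the real loop lives in the Repo2Docker application and differs in detail):

    # Simplified provider selection; assumes each provider implements
    # detect(spec) and returns a dict (as Dataverse.detect below) or None.
    def pick_provider(content_providers, spec):
        for provider_class in content_providers:
            provider = provider_class()
            detected = provider.detect(spec)
            if detected is not None:
                return provider, detected
        raise ValueError("No content provider recognized: {}".format(spec))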
repo2docker/contentproviders/__init__.py
@@ -2,3 +2,4 @@ from .git import Git
 from .base import Local
 from .zenodo import Zenodo
 from .figshare import Figshare
+from .dataverse import Dataverse
repo2docker/contentproviders/dataverse.json
File diff suppressed because one or more lines are too long
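The suppressed file is the registry of known Dataverse installations that detect() checks URLs against. Its full contents are not shown here, but given how Dataverse.__init__ reads it, the shape is roughly the following (illustrative excerpt; the real file lists many installations and each entry may carry more fields than "url"):

    {
      "installations": [
        {"url": "https://dataverse.harvard.edu"},
        {"url": "https://demo.dataverse.org"}
      ]
    }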
repo2docker/contentproviders/dataverse.py
@@ -0,0 +1,129 @@
+import os
+import json
+import shutil
+
+from urllib.request import Request
+from urllib.parse import urlparse, urlunparse
+from zipfile import ZipFile
+
+from .doi import DoiProvider
+from ..utils import copytree, deep_get, is_doi, normalize_doi
+
+
+class Dataverse(DoiProvider):
+    """Provide contents of a Dataverse dataset."""
+
+    def __init__(self):
+        data_file = os.path.join(os.path.dirname(__file__), "dataverse.json")
+        with open(data_file, "r") as fp:
+            self.hosts = json.load(fp)["installations"]
+        super().__init__()
+
+    def detect(self, doi, ref=None, extra_args=None):
+        """Trigger this provider for things that resolve to a Dataverse dataset.
+
+        Handles:
+        - DOI pointing to {siteURL}/dataset.xhtml?persistentId={persistentId}
+        - DOI pointing to {siteURL}/file.xhtml?persistentId={persistentId}&...
+        - URL {siteURL}/api/access/datafile/{fileId}
+
+        Examples:
+        - https://dataverse.harvard.edu/api/access/datafile/3323458
+        - doi:10.7910/DVN/6ZXAGT
+        - doi:10.7910/DVN/6ZXAGT/3YRRYJ
+
+        """
+        url = self.doi2url(doi)
+
+        # Check if the URL matches any known Dataverse installation; bail if not.
+        host = next((host for host in self.hosts if url.startswith(host["url"])), None)
+        if host is None:
+            return
+
+        # Parse the URL to get the base for later API calls
+        parsed_url = urlparse(url)
+
+        # Corner case handling
+        if parsed_url.path.startswith("/file.xhtml"):
+            # There is no way to look up file information from its persistentId
+            # alone; the only thing we can do is assume that the DOI is structured
+            # as "doi:<dataset_doi>/<file_doi>" and handle the enclosing dataset.
+            new_doi = doi.rsplit("/", 1)[0]
+            if new_doi == doi:
+                # Tough luck :( Avoid infinite recursion and exit.
+                return
+            return self.detect(new_doi)
+        elif parsed_url.path.startswith("/api/access/datafile"):
+            # A raw URL pointing to a datafile is a typical output of an
+            # External Tool integration.
+            entity_id = os.path.basename(parsed_url.path)
+            search_query = "q=entityId:" + entity_id + "&type=file"
+            # Knowing the file identifier, query the Search API for the parent dataset.
+            search_url = urlunparse(
+                parsed_url._replace(path="/api/search", query=search_query)
+            )
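+            # For the docstring's datafile example this builds (illustratively):
+            # https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file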
+            resp = self.urlopen(search_url).read()
+            data = json.loads(resp.decode("utf-8"))["data"]
+            if data["count_in_response"] != 1:
+                self.log.debug("Dataverse search query failed!")
+                self.log.debug(" - doi = " + doi)
+                self.log.debug(" - url = " + url)
+                self.log.debug(" - resp = " + json.dumps(data))
+                return
+
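+            # deep_get follows a dotted path ("items.0...." = first hit in the
+            # search results), yielding the parent dataset's persistent id.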
+            self.record_id = deep_get(data, "items.0.dataset_persistent_id")
+        elif is_doi(doi):
+            self.record_id = "doi:" + normalize_doi(doi)
+
+        if hasattr(self, "record_id"):
+            return {"record": self.record_id, "host": host}
+
+    def fetch(self, spec, output_dir, yield_output=False):
+        """Fetch and unpack a Dataverse dataset."""
+        record_id = spec["record"]
+        host = spec["host"]
+
+        yield "Fetching Dataverse record {}.\n".format(record_id)
+        req = Request(
+            "{}/api/datasets/:persistentId?persistentId={}".format(
+                host["url"], record_id
+            ),
+            headers={"accept": "application/json"},
+        )
+        resp = self.urlopen(req)
+        record = json.loads(resp.read().decode("utf-8"))["data"]
+
+        # To fetch the entire dataset we build a list of file IDs we want
+        # and then receive a zip file containing all of them.
+        # TODO: Dataverse has a limit for the zipfile size (see
+        # https://github.com/jupyter/repo2docker/pull/739#issuecomment-510834729).
+        # If the size of the dataset is greater than 100MB, individual files
+        # should be downloaded instead.
+        file_ids = [
+            str(deep_get(fobj, "dataFile.id"))
+            for fobj in deep_get(record, "latestVersion.files")
+        ]
+
+        req = Request(
+            "{}/api/access/datafiles/{}".format(host["url"], ",".join(file_ids))
+        )
+
+        dst_fname = os.path.join(output_dir, "dataverse.zip")
+        with self.urlopen(req) as src, open(dst_fname, "wb") as dst:
+            yield "Fetching files bundle\n"
+            shutil.copyfileobj(src, dst)
+
+        yield "Extracting files\n"
+        with ZipFile(dst_fname) as zfile:
+            zfile.extractall(path=output_dir)
+
+        os.remove(dst_fname)
+        new_subdirs = os.listdir(output_dir)
+        # If there is only one new subdirectory, move its contents
+        # to the top-level directory.
+        if len(new_subdirs) == 1:
+            d = new_subdirs[0]
+            copytree(os.path.join(output_dir, d), output_dir)
+            shutil.rmtree(os.path.join(output_dir, d))
+
+    @property
+    def content_id(self):
+        """The Dataverse persistent identifier (could use the internal dataset_id too)."""
+        return self.record_id
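The TODO in fetch() points at a possible follow-up: when a dataset exceeds the zip size limit, fall back to downloading files one by one. A hypothetical sketch of that fallback, reusing the record metadata already fetched above (it assumes each file entry carries a dataFile.filename field, and it ignores any directory structure for brevity; neither is confirmed by this commit):

    # Hypothetical per-file fallback for the zip size limit; not part of this commit.
    for fobj in deep_get(record, "latestVersion.files"):
        file_id = str(deep_get(fobj, "dataFile.id"))
        file_name = deep_get(fobj, "dataFile.filename")  # assumed field
        req = Request("{}/api/access/datafile/{}".format(host["url"], file_id))
        with self.urlopen(req) as src:
            with open(os.path.join(output_dir, file_name), "wb") as dst:
                shutil.copyfileobj(src, dst)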
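Taken together, detect() turns a DOI or URL into a {"record", "host"} spec and fetch() materializes the dataset into a directory. A hypothetical end-to-end use outside repo2docker's own driver loop (the spec shown is what the docstring examples suggest, not verified output):

    provider = Dataverse()
    spec = provider.detect("doi:10.7910/DVN/6ZXAGT")
    # e.g. {"record": "doi:10.7910/DVN/6ZXAGT", "host": {...}} when the DOI
    # resolves to a known installation; detect() returns nothing otherwise.
    if spec is not None:
        for message in provider.fetch(spec, "./dataset"):
            print(message, end="")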