From b854b77be55c4ff20f3b8b1b21e896f405a80b8c Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Tue, 17 Dec 2024 11:50:10 -0800 Subject: [PATCH] Fix tests --- repo2docker/contentproviders/dataverse.py | 101 +++++++++++++--------- tests/contentproviders/test_dataverse.py | 40 +++------ 2 files changed, 71 insertions(+), 70 deletions(-) diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py index 125af7a5..3cf8cb20 100644 --- a/repo2docker/contentproviders/dataverse.py +++ b/repo2docker/contentproviders/dataverse.py @@ -2,7 +2,7 @@ import hashlib import json import os import shutil -from typing import List +from typing import List, Tuple from urllib.parse import parse_qs, urlparse from ..utils import copytree, deep_get, is_doi @@ -67,29 +67,69 @@ class Dataverse(DoiProvider): # # We don't know exactly what kind of dataverse object this is, but # that can be figured out during fetch as needed - return {"host": host, "url": url} + return url - def get_dataset_id_from_file_id(self, host: str, file_id: str) -> str: + def get_dataset_id_from_file_id(self, base_url: str, file_id: str) -> str: """ Return the persistent_id (DOI) that a given file_id (int or doi) belongs to """ if file_id.isdigit(): # the file_id is an integer, rather than a persistent id (DOI) - api_url = f"{host}/api/files/{file_id}?returnDatasetVersion=true" + api_url = f"{base_url}/api/files/{file_id}?returnDatasetVersion=true" else: # the file_id is a doi itself - api_url = f"{host}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true" + api_url = f"{base_url}/api/files/:persistentId?persistentId={file_id}&returnDatasetVersion=true" resp = self._request(api_url) if resp.status_code == 404: - raise ValueError(f"File with id {file_id} not found in {host}") + raise ValueError(f"File with id {file_id} not found in {base_url}") resp.raise_for_status() data = resp.json()["data"] return data["datasetVersion"]["datasetPersistentId"] - def get_datafiles(self, dataverse_host: str, url: str) -> List[dict]: + def parse_dataverse_url(self, url: str) -> Tuple[str, bool]: + """ + Parse the persistent id out of a dataverse URL + + persistent_id can point to either a dataset or a file. The second return + value is False if we know that the persistent id is a file or a dataset, + and True if it is ambiguous. + + Raises a ValueError if we can not parse the url + """ + parsed_url= urlparse(url) + path = parsed_url.path + qs = parse_qs(parsed_url.query) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + + is_ambiguous = False + # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP + if path.startswith("/citation"): + is_ambiguous = True + persistent_id = qs["persistentId"][0] + # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP + elif path.startswith("/dataset.xhtml"): + # https://dataverse.harvard.edu/api/access/datafile/3323458 + persistent_id = qs["persistentId"][0] + elif path.startswith("/api/access/datafile"): + # What we have here is an entity id, which we can use to get a persistentId + file_id = os.path.basename(path) + persistent_id = self.get_dataset_id_from_file_id(base_url, file_id) + elif parsed_url.path.startswith("/file.xhtml"): + file_persistent_id = qs["persistentId"][0] + persistent_id = self.get_dataset_id_from_file_id( + base_url, file_persistent_id + ) + else: + raise ValueError( + f"Could not determine persistent id for dataverse URL {url}" + ) + + return persistent_id, is_ambiguous + + def get_datafiles(self, url: str) -> List[dict]: """ Return a list of dataFiles for given persistent_id @@ -107,52 +147,27 @@ class Dataverse(DoiProvider): """ parsed_url = urlparse(url) - path = parsed_url.path - qs = parse_qs(parsed_url.query) - dataverse_host = f"{parsed_url.scheme}://{parsed_url.netloc}" - url_kind = None - persistent_id = None - is_ambiguous = False + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - # https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP - if path.startswith("/citation"): - is_ambiguous = True - persistent_id = qs["persistentId"][0] - # https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP - elif path.startswith("/dataset.xhtml"): - # https://dataverse.harvard.edu/api/access/datafile/3323458 - persistent_id = qs["persistentId"][0] - elif path.startswith("/api/access/datafile"): - # What we have here is an entity id, which we can use to get a persistentId - file_id = os.path.basename(parsed_url.path) - persistent_id = self.get_dataset_id_from_file_id(dataverse_host, file_id) - elif parsed_url.path.startswith("/file.xhtml"): - file_persistent_id = qs["persistentId"][0] - persistent_id = self.get_dataset_id_from_file_id( - dataverse_host, file_persistent_id - ) - else: - raise ValueError( - f"Could not determine persistent id for dataverse URL {url}" - ) + persistent_id, is_ambiguous = self.parse_dataverse_url(url) dataset_api_url = ( - f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}" ) resp = self._request(dataset_api_url, headers={"accept": "application/json"}) if resp.status_code == 404 and is_ambiguous: # It's possible this is a *file* persistent_id, not a dataset one persistent_id = self.get_dataset_id_from_file_id( - dataverse_host, persistent_id + base_url, persistent_id ) - dataset_api_url = f"{dataverse_host}/api/datasets/:persistentId?persistentId={persistent_id}" + dataset_api_url = f"{base_url}/api/datasets/:persistentId?persistentId={persistent_id}" resp = self._request( dataset_api_url, headers={"accept": "application/json"} ) if resp.status_code == 404: # This persistent id is just not here - raise ValueError(f"{persistent_id} on {dataverse_host} is not found") + raise ValueError(f"{persistent_id} on {base_url} is not found") # We already handled 404, raise error for everything else resp.raise_for_status() @@ -163,15 +178,17 @@ class Dataverse(DoiProvider): def fetch(self, spec, output_dir, yield_output=False): """Fetch and unpack a Dataverse dataset.""" - url = spec["url"] - host = spec["host"] + url = spec + parsed_url = urlparse(url) + # FIXME: Support determining API URL better + base_url = f'{parsed_url.scheme}://{parsed_url.netloc}' yield f"Fetching Dataverse record {url}.\n" - for fobj in self.get_datafiles(host["url"], url): + for fobj in self.get_datafiles(url): file_url = ( # without format=original you get the preservation format (plain text, tab separated) - f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' + f'{base_url}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original' ) filename = fobj["label"] original_filename = fobj["dataFile"].get("originalFileName", None) diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py index cb5e06f3..7b6d1f60 100644 --- a/tests/contentproviders/test_dataverse.py +++ b/tests/contentproviders/test_dataverse.py @@ -6,48 +6,28 @@ import pytest from repo2docker.contentproviders import Dataverse -test_dv = Dataverse() -harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse") -cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data") - - @pytest.mark.parametrize( ("doi", "resolved"), [ ( "doi:10.7910/DVN/6ZXAGT/3YRRYJ", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", - }, + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", ), ( "10.7910/DVN/6ZXAGT/3YRRYJ", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", - }, + "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", ), ( "10.7910/DVN/TJCLKP", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", - }, + "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", ), ( "https://dataverse.harvard.edu/api/access/datafile/3323458", - { - "host": harvard_dv, - "url": "https://dataverse.harvard.edu/api/access/datafile/3323458", - }, + "https://dataverse.harvard.edu/api/access/datafile/3323458", ), ( "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", - { - "host": cimmyt_dv, - "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", - }, + "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", ), ("/some/random/string", None), ("https://example.com/path/here", None), @@ -60,28 +40,32 @@ def test_detect(doi, resolved): @pytest.mark.parametrize( - ("url", "persistent_id"), + ("url", "persistent_id", "is_ambiguous"), [ ( "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT", + False ), ( "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP", + True ), ( "https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR", + False ), ( "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016", + False ), ], ) -def test_get_persistent_id(url, persistent_id): - assert Dataverse().get_persistent_id_from_url(url) == persistent_id +def test_get_persistent_id(url, persistent_id, is_ambiguous): + assert Dataverse().parse_dataverse_url(url) == (persistent_id, is_ambiguous) @pytest.mark.parametrize(