From 1260a5a394665d8de7ca8de09b18ddecb0ebfa3b Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 16 Dec 2024 17:02:25 -0800
Subject: [PATCH] Support fetching single files in dataverse

---
 repo2docker/contentproviders/dataverse.py |  42 ++++++--
 tests/contentproviders/test_dataverse.py  | 113 ++++++++++++++++------
 2 files changed, 118 insertions(+), 37 deletions(-)

diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index a8ee69fc..9b42b07f 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -106,7 +106,7 @@ class Dataverse(DoiProvider):
             )
             return data["items"][0]["dataset_persistent_id"]
         elif parsed_url.path.startswith("/file.xhtml"):
-            file_persistent_id = qs['persistentId'][0]
+            file_persistent_id = qs["persistentId"][0]
             dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
             if file_persistent_id == dataset_persistent_id:
                 # We can't figure this one out, throw an error
@@ -115,6 +115,38 @@ class Dataverse(DoiProvider):
 
         raise ValueError(f"Could not determine persistent id for dataverse URL {url}")
 
+    def get_datafiles(self, host: str, persistent_id: str) -> list[dict]:
+        """
+        Return a list of dataFiles for the given persistent_id
+        """
+        dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"
+
+        resp = self._request(dataset_url, headers={"accept": "application/json"})
+        # Assume it's a dataset
+        is_dataset = True
+        if resp.status_code == 404:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
+            resp = self._request(file_url, headers={"accept": "application/json"})
+
+            if resp.status_code == 404:
+                # This persistent id is just not here
+                raise ValueError(f"{persistent_id} on {host} is not found")
+
+            # It's not a dataset, it's a file!
+            is_dataset = False
+
+        # We already handled 404, raise error for everything else
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+
+        if is_dataset:
+            return data["latestVersion"]["files"]
+        else:
+            # Only one file object
+            return [data]
+
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
         url = spec["url"]
@@ -123,13 +155,8 @@ class Dataverse(DoiProvider):
         persistent_id = self.get_persistent_id_from_url(url)
 
         yield f"Fetching Dataverse record {persistent_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}'
 
-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        print(resp.json())
-        record = resp.json()["data"]
-
-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(host["url"], persistent_id):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
                 f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
@@ -155,7 +182,6 @@ class Dataverse(DoiProvider):
                     copytree(os.path.join(output_dir, d), output_dir)
                     shutil.rmtree(os.path.join(output_dir, d))
 
-        # Save persistent id
         self.persitent_id = persistent_id
 
diff --git a/tests/contentproviders/test_dataverse.py b/tests/contentproviders/test_dataverse.py
index 3fcefd32..e0c7e178 100644
--- a/tests/contentproviders/test_dataverse.py
+++ b/tests/contentproviders/test_dataverse.py
@@ -10,19 +10,50 @@
 test_dv = Dataverse()
 harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
 cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")
+
 @pytest.mark.parametrize(
     ("doi", "resolved"),
     [
-        ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
-        ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
-        ("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}),
-        ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}),
-        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}),
+        (
+            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            },
+        ),
+        (
+            "10.7910/DVN/6ZXAGT/3YRRYJ",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            },
+        ),
+        (
+            "10.7910/DVN/TJCLKP",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            },
+        ),
+        (
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            },
+        ),
+        (
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            {
+                "host": cimmyt_dv,
+                "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            },
+        ),
         ("/some/random/string", None),
         ("https://example.com/path/here", None),
         # Non dataverse DOIs
-        ("https://doi.org/10.21105/joss.01277", None)
-    ]
+        ("https://doi.org/10.21105/joss.01277", None),
+    ],
 )
 def test_detect(doi, resolved):
     assert Dataverse().detect(doi) == resolved
@@ -31,37 +62,61 @@ def test_detect(doi, resolved):
 @pytest.mark.parametrize(
     ("url", "persistent_id"),
     [
-        ("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"),
-        ("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"),
-        ("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"),
-        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"),
-    ]
+        (
+            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            "doi:10.7910/DVN/6ZXAGT",
+        ),
+        (
+            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            "doi:10.7910/DVN/TJCLKP",
+        ),
+        (
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            "doi:10.7910/DVN/3MJ7IR",
+        ),
+        (
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            "hdl:11529/10016",
+        ),
+    ],
 )
 def test_get_persistent_id(url, persistent_id):
     assert Dataverse().get_persistent_id_from_url(url) == persistent_id
 
 
-def test_dataverse_fetch():
+@pytest.mark.parametrize(
+    ("spec", "md5tree"),
+    [
+        (
+            "doi:10.7910/DVN/TJCLKP",
+            {
+                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
+                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
+                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
+            },
+        ),
+        (
+            # A citation targeting a single file
+            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            {
+                "ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5",
+            },
+        ),
+    ],
+)
+def test_fetch(spec, md5tree):
     dv = Dataverse()
-    spec = dv.detect("doi:10.7910/DVN/TJCLKP")
 
     with TemporaryDirectory() as d:
         output = []
-        for l in dv.fetch(spec, d):
+        for l in dv.fetch(dv.detect(spec), d):
             output.append(l)
 
-        # Verify two directories
-        assert set(os.listdir(d)) == {"data", "code"}
-
-        # Verify sha256sum of three files
-        expected_sha = {
-            'data/primary/primary-data.zip': '880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b',
-            'data/2023-01-03.tsv': 'cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f',
-            'code/language.py': '1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215'
-        }
-
-        for subpath, expected_sha in expected_sha.items():
-            with open(os.path.join(d, subpath), 'rb') as f:
-                h = hashlib.sha256()
+        # Verify md5 sum of the files we expect to find
+        # We are using md5 instead of something more secure because that is what
+        # dataverse itself uses
+        for subpath, expected_md5 in md5tree.items():
+            with open(os.path.join(d, subpath), "rb") as f:
+                h = hashlib.md5()
                 h.update(f.read())
-            assert h.hexdigest() == expected_sha
\ No newline at end of file
+            assert h.hexdigest() == expected_md5
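
Reviewer note: below is a minimal, self-contained sketch of the dataset-vs-file
fallback that the new get_datafiles() implements. It is not part of the patch:
it uses the requests library directly instead of repo2docker's self._request
wrapper, and the "filename" key in the loop at the bottom is assumed from the
standard Dataverse native API response (the patch itself only reads
dataFile.id). The persistent ids come from the test suite.

import requests


def get_datafiles(host: str, persistent_id: str) -> list[dict]:
    """Return the dataFile entries for a dataset *or* single-file persistent id."""
    dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"
    resp = requests.get(dataset_url, headers={"accept": "application/json"})
    is_dataset = True

    if resp.status_code == 404:
        # Not a dataset id; maybe it is a *file* persistent id
        file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
        resp = requests.get(file_url, headers={"accept": "application/json"})
        if resp.status_code == 404:
            raise ValueError(f"{persistent_id} on {host} is not found")
        is_dataset = False

    # 404s are handled above; fail loudly on anything else
    resp.raise_for_status()
    data = resp.json()["data"]

    # A dataset lists its files under latestVersion.files; a file lookup
    # returns a single object shaped like one entry of that list
    return data["latestVersion"]["files"] if is_dataset else [data]


# A dataset id returns every file; a file id such as
# doi:10.7910/DVN/6ZXAGT/3YRRYJ exercises the fallback branch instead.
for fobj in get_datafiles("https://dataverse.harvard.edu", "doi:10.7910/DVN/TJCLKP"):
    print(fobj["dataFile"]["id"], fobj["dataFile"]["filename"])

Returning a one-element list for the file case is what lets fetch() use a
single loop for both kinds of persistent id.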
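
The switch from sha256 to md5 in test_fetch follows what Dataverse itself
publishes as file checksums. A standalone version of that assertion, for
illustration only (the helper name verify_md5tree is ours, not the test's):

import hashlib
import os


def verify_md5tree(root: str, md5tree: dict[str, str]) -> None:
    """Assert that each file under root matches its expected md5 checksum."""
    for subpath, expected_md5 in md5tree.items():
        h = hashlib.md5()
        with open(os.path.join(root, subpath), "rb") as f:
            h.update(f.read())
        assert h.hexdigest() == expected_md5, f"checksum mismatch for {subpath}"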