Mirror of https://github.com/jupyterhub/repo2docker

Support fetching single files in Dataverse

parent 172f8b017d
commit 1260a5a394
@@ -106,7 +106,7 @@ class Dataverse(DoiProvider):
             )
             return data["items"][0]["dataset_persistent_id"]
         elif parsed_url.path.startswith("/file.xhtml"):
-            file_persistent_id = qs['persistentId'][0]
+            file_persistent_id = qs["persistentId"][0]
             dataset_persistent_id = file_persistent_id.rsplit("/", 1)[0]
             if file_persistent_id == dataset_persistent_id:
                 # We can't figure this one out, throw an error
@@ -115,6 +115,38 @@ class Dataverse(DoiProvider):

         raise ValueError(f"Could not determine persistent id for dataverse URL {url}")

+    def get_datafiles(self, host: str, persistent_id: str) -> list[dict]:
+        """
+        Return a list of dataFiles for given persistent_id
+        """
+        dataset_url = f"{host}/api/datasets/:persistentId?persistentId={persistent_id}"
+
+        resp = self._request(dataset_url, headers={"accept": "application/json"})
+        # Assume it's a dataset
+        is_dataset = True
+        if resp.status_code == 404:
+            # It's possible this is a *file* persistent_id, not a dataset one
+            file_url = f"{host}/api/files/:persistentId?persistentId={persistent_id}"
+            resp = self._request(file_url, headers={"accept": "application/json"})
+
+            if resp.status_code == 404:
+                # This persistent id is just not here
+                raise ValueError(f"{persistent_id} on {host} is not found")
+
+            # It's not a dataset, it's a file!
+            is_dataset = False
+
+        # We already handled 404, raise error for everything else
+        resp.raise_for_status()
+
+        data = resp.json()["data"]
+
+        if is_dataset:
+            return data["latestVersion"]["files"]
+        else:
+            # Only one file object
+            return [data]
+
     def fetch(self, spec, output_dir, yield_output=False):
         """Fetch and unpack a Dataverse dataset."""
         url = spec["url"]
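The new get_datafiles helper encodes a simple probe against the Dataverse native API: try the persistent identifier as a dataset first, and on a 404 retry it as a single-file identifier. A minimal standalone sketch of that flow, using requests directly instead of the provider's _request wrapper (the function name probe_datafiles is hypothetical; the host and DOI come from the test fixtures):

    import requests

    def probe_datafiles(host: str, persistent_id: str) -> list[dict]:
        # Try the identifier as a dataset first.
        resp = requests.get(
            f"{host}/api/datasets/:persistentId",
            params={"persistentId": persistent_id},
            headers={"accept": "application/json"},
        )
        if resp.status_code == 404:
            # Maybe it is a *file* persistent id; retry against the files endpoint.
            resp = requests.get(
                f"{host}/api/files/:persistentId",
                params={"persistentId": persistent_id},
                headers={"accept": "application/json"},
            )
            if resp.status_code == 404:
                raise ValueError(f"{persistent_id} on {host} is not found")
            resp.raise_for_status()
            return [resp.json()["data"]]  # a single file object
        resp.raise_for_status()
        return resp.json()["data"]["latestVersion"]["files"]

    # e.g. probe_datafiles("https://dataverse.harvard.edu", "doi:10.7910/DVN/TJCLKP")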
@@ -123,13 +155,8 @@ class Dataverse(DoiProvider):
         persistent_id = self.get_persistent_id_from_url(url)

         yield f"Fetching Dataverse record {persistent_id}.\n"
-        url = f'{host["url"]}/api/datasets/:persistentId?persistentId={persistent_id}'

-        resp = self.urlopen(url, headers={"accept": "application/json"})
-        print(resp.json())
-        record = resp.json()["data"]
-
-        for fobj in deep_get(record, "latestVersion.files"):
+        for fobj in self.get_datafiles(host["url"], persistent_id):
             file_url = (
                 # without format=original you get the preservation format (plain text, tab separated)
                 f'{host["url"]}/api/access/datafile/{deep_get(fobj, "dataFile.id")}?format=original'
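Note the format=original query parameter on the access URL: without it, Dataverse serves the preservation copy of tabular files (plain text, tab separated) rather than the originally deposited bytes. A small illustrative snippet (the datafile id 3323458 appears in the test fixtures):

    import requests

    host = "https://dataverse.harvard.edu"
    datafile_id = 3323458  # example id taken from the tests
    file_url = f"{host}/api/access/datafile/{datafile_id}?format=original"
    content = requests.get(file_url).content  # the original uploaded bytes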
@@ -155,7 +182,6 @@ class Dataverse(DoiProvider):
                 copytree(os.path.join(output_dir, d), output_dir)
                 shutil.rmtree(os.path.join(output_dir, d))

-
        # Save persistent id
        self.persitent_id = persistent_id

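The unchanged context around this hunk hoists the contents of a single extracted directory up into output_dir. A sketch of that idiom, assuming Python 3.8+ (flatten_single_subdir is a hypothetical helper, not part of this diff):

    import os
    import shutil

    def flatten_single_subdir(output_dir: str) -> None:
        # If unpacking produced exactly one top-level directory, merge its
        # contents into output_dir and drop the now-empty wrapper.
        entries = os.listdir(output_dir)
        if len(entries) == 1 and os.path.isdir(os.path.join(output_dir, entries[0])):
            d = entries[0]
            # dirs_exist_ok lets copytree merge into the existing output_dir
            shutil.copytree(os.path.join(output_dir, d), output_dir, dirs_exist_ok=True)
            shutil.rmtree(os.path.join(output_dir, d))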
@@ -10,19 +10,50 @@ test_dv = Dataverse()
 harvard_dv = next(_ for _ in test_dv.hosts if _["name"] == "Harvard Dataverse")
 cimmyt_dv = next(_ for _ in test_dv.hosts if _["name"] == "CIMMYT Research Data")


 @pytest.mark.parametrize(
     ("doi", "resolved"),
     [
-        ("doi:10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
-        ("10.7910/DVN/6ZXAGT/3YRRYJ", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"}),
-        ("10.7910/DVN/TJCLKP", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP"}),
-        ("https://dataverse.harvard.edu/api/access/datafile/3323458", {"host": harvard_dv, "url": "https://dataverse.harvard.edu/api/access/datafile/3323458"}),
-        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", {"host": cimmyt_dv, "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016"}),
+        (
+            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            },
+        ),
+        (
+            "10.7910/DVN/6ZXAGT/3YRRYJ",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            },
+        ),
+        (
+            "10.7910/DVN/TJCLKP",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            },
+        ),
+        (
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            {
+                "host": harvard_dv,
+                "url": "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            },
+        ),
+        (
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            {
+                "host": cimmyt_dv,
+                "url": "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            },
+        ),
         ("/some/random/string", None),
         ("https://example.com/path/here", None),
         # Non dataverse DOIs
-        ("https://doi.org/10.21105/joss.01277", None)
-    ]
+        ("https://doi.org/10.21105/joss.01277", None),
+    ],
 )
 def test_detect(doi, resolved):
     assert Dataverse().detect(doi) == resolved
@@ -31,37 +62,61 @@ def test_detect(doi, resolved):
 @pytest.mark.parametrize(
     ("url", "persistent_id"),
     [
-        ("https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ", "doi:10.7910/DVN/6ZXAGT"),
-        ("https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP", "doi:10.7910/DVN/TJCLKP"),
-        ("https://dataverse.harvard.edu/api/access/datafile/3323458", "doi:10.7910/DVN/3MJ7IR"),
-        ("https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016", "hdl:11529/10016"),
-    ]
+        (
+            "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            "doi:10.7910/DVN/6ZXAGT",
+        ),
+        (
+            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/TJCLKP",
+            "doi:10.7910/DVN/TJCLKP",
+        ),
+        (
+            "https://dataverse.harvard.edu/api/access/datafile/3323458",
+            "doi:10.7910/DVN/3MJ7IR",
+        ),
+        (
+            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
+            "hdl:11529/10016",
+        ),
+    ],
 )
 def test_get_persistent_id(url, persistent_id):
     assert Dataverse().get_persistent_id_from_url(url) == persistent_id


-def test_dataverse_fetch():
+@pytest.mark.parametrize(
+    ("spec", "md5tree"),
+    [
+        (
+            "doi:10.7910/DVN/TJCLKP",
+            {
+                "data/primary/primary-data.zip": "a8f6fc3fc58f503cd48e23fa8b088694",
+                "data/2023-01-03.tsv": "6fd497bf13dab9a06fe737ebc22f1917",
+                "code/language.py": "9d61582bcf497c83bbd1ed0eed3c772e",
+            },
+        ),
+        (
+            # A citation targeting a single file
+            "https://dataverse.harvard.edu/citation?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ",
+            {
+                "ARCHAEOGLOBE_CONSENSUS_ASSESSMENT.tab": "17a91888ed8e91dfb63acbbab6127ac5"
+            },
+        ),
+    ],
+)
+def test_fetch(spec, md5tree):
     dv = Dataverse()
-    spec = dv.detect("doi:10.7910/DVN/TJCLKP")

     with TemporaryDirectory() as d:
         output = []
-        for l in dv.fetch(spec, d):
+        for l in dv.fetch(dv.detect(spec), d):
             output.append(l)

-        # Verify two directories
-        assert set(os.listdir(d)) == {"data", "code"}
-
-        # Verify sha256sum of three files
-        expected_sha = {
-            'data/primary/primary-data.zip': '880f99a1e1d54a2553be61301f92e06b29236785b8d4d1b7ad0b4595d9d7512b',
-            'data/2023-01-03.tsv': 'cc9759e8e6bc076dd7c1a8eb53a7ea3d38e8697fa9f544d15768db308516cc5f',
-            'code/language.py': '1ffb3b3cdc9de01279779f3fc88824672c8ec3ab1c41ecdd5c1b59a9b0202215'
-        }
-
-        for subpath, expected_sha in expected_sha.items():
-            with open(os.path.join(d, subpath), 'rb') as f:
-                h = hashlib.sha256()
+        # Verify md5 sum of the files we expect to find
+        # We are using md5 instead of something more secure because that is what
+        # dataverse itself uses
+        for subpath, expected_sha in md5tree.items():
+            with open(os.path.join(d, subpath), "rb") as f:
+                h = hashlib.md5()
                 h.update(f.read())
                 assert h.hexdigest() == expected_sha
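Taken together, detect and fetch now cover both dataset specs and single-file specs. A usage sketch mirroring test_fetch (assuming repo2docker is installed and the Harvard Dataverse instance is reachable):

    from tempfile import TemporaryDirectory

    from repo2docker.contentproviders import Dataverse

    dv = Dataverse()
    # detect() resolves a DOI or URL to a spec like {"host": ..., "url": ...}, or None
    spec = dv.detect("doi:10.7910/DVN/TJCLKP")
    with TemporaryDirectory() as d:
        # fetch() is a generator: it yields progress lines while writing files into d
        for line in dv.fetch(spec, d):
            print(line, end="")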