# repo2docker/tests/unit/contentproviders/test_dataverse.py
#
# Source-viewer residue: 160 lines, 5.1 KiB, Python
# (viewer controls: Raw / Normal view / History)

2019-09-13 16:59:36 +00:00
import json
import os
import re
2019-09-13 16:59:36 +00:00
from io import BytesIO
from tempfile import TemporaryDirectory
2019-09-13 16:59:36 +00:00
from unittest.mock import patch
from urllib.request import Request, urlopen
2019-09-13 16:59:36 +00:00
import pytest
2019-09-13 16:59:36 +00:00
from repo2docker.contentproviders import Dataverse
2019-09-13 16:59:36 +00:00
# A provider instance used only to look up host entries by their "name" key.
# next() raises StopIteration at import time if a named host is missing.
test_dv = Dataverse()
harvard_dv = next(
    host for host in test_dv.hosts if host["name"] == "Harvard Dataverse"
)
cimmyt_dv = next(
    host for host in test_dv.hosts if host["name"] == "CIMMYT Research Data"
)
2019-09-13 16:59:36 +00:00
# Parametrization data for test_detect_dataverse: one (inputs, expected) case.
test_hosts = [
    (
        # Inputs handed to Dataverse().detect(), in the order the test uses them.
        [
            "doi:10.7910/DVN/6ZXAGT/3YRRYJ",
            "10.7910/DVN/6ZXAGT",
            "https://dataverse.harvard.edu/api/access/datafile/3323458",
            "https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10016",
        ],
        # The test compares the first three inputs against expected[0] and the
        # last input against expected[1].
        [
            {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"},
            {"host": cimmyt_dv, "record": "hdl:11529/10016"},
        ],
    )
]
# Canned redirect targets keyed by the requested URL. The doi_resolver callback
# in test_detect_dataverse looks requests up here and, on a hit, answers with a
# 302 whose Location header is the mapped value.
doi_responses = {
    "https://doi.org/10.7910/DVN/6ZXAGT/3YRRYJ": (
        "https://dataverse.harvard.edu/file.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT/3YRRYJ"
    ),
    "https://doi.org/10.7910/DVN/6ZXAGT": (
        "https://dataverse.harvard.edu/dataset.xhtml"
        "?persistentId=doi:10.7910/DVN/6ZXAGT"
    ),
    "https://dataverse.harvard.edu/api/access/datafile/3323458": (
        "https://dataverse.harvard.edu/api/access/datafile/3323458"
    ),
    "https://doi.org/10.21105/joss.01277": (
        "https://joss.theoj.org/papers/10.21105/joss.01277"
    ),
}
@pytest.mark.parametrize("test_input, expected", test_hosts)
def test_detect_dataverse(test_input, expected, requests_mock):
    """Check which inputs Dataverse.detect() accepts and how many requests it makes.

    All HTTPS traffic is intercepted by ``requests_mock``; URLs listed in
    ``doi_responses`` are answered as 302 redirects.

    Args:
        test_input: list of strings passed to ``detect()`` one at a time.
        expected: list of the specs ``detect()`` should return for them.
        requests_mock: the requests-mock pytest fixture.
    """

    def doi_resolver(req, context):
        resp = doi_responses.get(req.url)
        # doi responses are redirects
        if resp is not None:
            context.status_code = 302
            context.headers["Location"] = resp
        return resp

    # Catch-all for any https:// URL, plus a specific mock for the file search.
    requests_mock.get(re.compile("https://"), json=doi_resolver)
    requests_mock.get(
        "https://dataverse.harvard.edu/api/search?q=entityId:3323458&type=file",
        json={
            "data": {
                "count_in_response": 1,
                "items": [{"dataset_persistent_id": "doi:10.7910/DVN/6ZXAGT"}],
            }
        },
    )

    assert requests_mock.call_count == 0

    # valid Dataverse DOIs trigger this content provider
    assert Dataverse().detect(test_input[0]) == expected[0]
    # 4: doi resolution (302), File, doi resolution (302), then dataset
    assert requests_mock.call_count == 4
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[1]) == expected[0]
    # 2: doi (302), dataset
    assert requests_mock.call_count == 2
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[2]) == expected[0]
    # 1: datafile (search dataverse for the file id)
    assert requests_mock.call_count == 1
    requests_mock.reset_mock()

    assert Dataverse().detect(test_input[3]) == expected[1]
    requests_mock.reset_mock()

    # Don't trigger the Dataverse content provider
    assert Dataverse().detect("/some/path/here") is None
    assert Dataverse().detect("https://example.com/path/here") is None
    # don't handle DOIs that aren't from Dataverse
    assert Dataverse().detect("https://doi.org/10.21105/joss.01277") is None
2019-09-13 16:59:36 +00:00
@pytest.fixture
def dv_files(tmpdir):
    """Create three small text files under ``tmpdir`` and return them.

    Layout created:
        some-file.txt
        directory/some-other-file.txt
        directory/subdirectory/the-other-file.txt

    Returns:
        list: the three file paths, in the order listed above.
    """
    f1 = tmpdir.join("some-file.txt")
    f1.write("some content")

    f2 = tmpdir.mkdir("directory").join("some-other-file.txt")
    f2.write("some other content")

    # "directory" already exists at this point, so only the subdirectory is made.
    f3 = tmpdir.join("directory").mkdir("subdirectory").join("the-other-file.txt")
    f3.write("yet another content")

    return [f1, f2, f3]
def test_dataverse_fetch(dv_files, requests_mock):
    """Fetch a mocked dataset and check the files land in the expected layout.

    The dataset metadata endpoint returns three file entries (ids 1-3); the
    datafile endpoint serves the bytes of the matching ``dv_files`` entry.

    Args:
        dv_files: list of three files created by the ``dv_files`` fixture.
        requests_mock: the requests-mock pytest fixture.
    """
    mock_response = {
        "data": {
            "latestVersion": {
                "files": [
                    {"dataFile": {"id": 1}, "label": "some-file.txt"},
                    {
                        "dataFile": {"id": 2},
                        "label": "some-other-file.txt",
                        "directoryLabel": "directory",
                    },
                    {
                        "dataFile": {"id": 3},
                        "label": "the-other-file.txt",
                        "directoryLabel": "directory/subdirectory",
                    },
                ]
            }
        }
    }

    spec = {"host": harvard_dv, "record": "doi:10.7910/DVN/6ZXAGT"}

    def mock_filecontent(req, context):
        # The last URL segment is the 1-based file id; dv_files is 0-indexed.
        file_no = int(req.url.split("/")[-1]) - 1
        # Close the handle deterministically instead of leaking it.
        with open(dv_files[file_no], "rb") as fh:
            return fh.read()

    requests_mock.get(
        "https://dataverse.harvard.edu/api/datasets/"
        ":persistentId?persistentId=doi:10.7910/DVN/6ZXAGT",
        json=mock_response,
    )
    requests_mock.get(
        re.compile("https://dataverse.harvard.edu/api/access/datafile"),
        content=mock_filecontent,
    )

    dv = Dataverse()

    with TemporaryDirectory() as d:
        # Iterate fetch() to completion; the yielded items are not inspected.
        for _ in dv.fetch(spec, d):
            pass

        unpacked_files = set(os.listdir(d))
        expected = {"directory", "some-file.txt"}
        assert expected == unpacked_files
        assert os.path.isfile(
            os.path.join(d, "directory", "subdirectory", "the-other-file.txt")
        )