kopia lustrzana https://github.com/jupyterhub/repo2docker
update hydroshare content provider to doi and add tests
rodzic
0549577a51
commit
d89f3a66aa
|
@ -6,42 +6,13 @@ import time
|
|||
from urllib.request import urlopen, Request, urlretrieve
|
||||
from urllib.error import HTTPError
|
||||
|
||||
from .base import ContentProvider
|
||||
from .doi import DoiProvider
|
||||
from ..utils import normalize_doi, is_doi
|
||||
|
||||
|
||||
class Hydroshare(ContentProvider):
|
||||
class Hydroshare(DoiProvider):
|
||||
"""Provide contents of a Hydroshare resource."""
|
||||
|
||||
def _urlopen(self, req, headers=None):
|
||||
"""A urlopen() helper"""
|
||||
# someone passed a string, not a request
|
||||
if not isinstance(req, Request):
|
||||
req = Request(req)
|
||||
|
||||
#req.add_header("User-Agent", "repo2docker {}".format(__version__))
|
||||
if headers is not None:
|
||||
for key, value in headers.items():
|
||||
req.add_header(key, value)
|
||||
|
||||
return urlopen(req)
|
||||
|
||||
def _doi2url(self, doi):
    """Turn a DOI into the URL it resolves to.

    Input that is not a DOI is assumed to already be a URL and is returned
    unchanged. A DOI whose lookup on doi.org raises ``HTTPError`` is
    returned in its normalized form instead of a URL.
    """
    if not is_doi(doi):
        # Not a DOI: what we were given is actually just a URL
        return doi

    normalized = normalize_doi(doi)
    try:
        response = self._urlopen("https://doi.org/{}".format(normalized))
    except HTTPError:
        # The DOI doesn't resolve, so hand back the normalized DOI itself
        return normalized
    return response.url
|
||||
|
||||
def detect(self, doi, ref=None, extra_args=None):
|
||||
"""Trigger this provider for things that resolve to a Hydroshare resource"""
|
||||
# We need the hostname (url where records are), api url (for metadata),
|
||||
|
@ -54,14 +25,17 @@ class Hydroshare(ContentProvider):
|
|||
},
|
||||
]
|
||||
|
||||
url = self._doi2url(doi)
|
||||
url = self.doi2url(doi)
|
||||
|
||||
for host in hosts:
|
||||
if any([url.startswith(s) for s in host["hostname"]]):
|
||||
self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
|
||||
return {"resource": self.resource_id, "host": host}
|
||||
|
||||
def fetch(self, spec, output_dir, yield_output=False):
|
||||
def _urlretrieve(bag_url):
|
||||
return urlretrieve(bag_url)
|
||||
|
||||
def fetch(self, spec, output_dir, yield_output=False, timeout=120):
|
||||
"""Fetch and unpack a Hydroshare resource"""
|
||||
resource_id = spec["resource"]
|
||||
host = spec["host"]
|
||||
|
@ -71,19 +45,23 @@ class Hydroshare(ContentProvider):
|
|||
bag_url = "{}{}".format(host["django_irods"], resource_id)
|
||||
|
||||
# bag downloads are prepared on demand and may need some time
|
||||
conn = urlopen(bag_url)
|
||||
while conn.info().get_content_type() != "application/zip":
|
||||
conn = self.urlopen(bag_url)
|
||||
total_wait_time = 0
|
||||
while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip":
|
||||
wait_time = 10
|
||||
total_wait_time += wait_time
|
||||
if total_wait_time > timeout:
|
||||
yield "Bag taking too long to prepare, exiting now, try again later."
|
||||
return
|
||||
yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
|
||||
time.sleep(wait_time)
|
||||
conn = self.urlopen(bag_url)
|
||||
if conn.getcode() != 200:
|
||||
yield "Failed to download bag. status code {}.\n".format(conn.getcode())
|
||||
return
|
||||
wait_time = 10
|
||||
yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
|
||||
time.sleep(wait_time)
|
||||
conn = urlopen(bag_url)
|
||||
|
||||
# Bag creation seems to need a small time buffer after it says it's ready.
|
||||
time.sleep(1)
|
||||
filehandle, _ = urlretrieve(bag_url)
|
||||
filehandle, _ = self._urlretrieve(bag_url)
|
||||
zip_file_object = zipfile.ZipFile(filehandle, 'r')
|
||||
yield "Downloaded, unpacking contents.\n"
|
||||
zip_file_object.extractall("temp")
|
||||
|
|
|
@ -0,0 +1,156 @@
|
|||
import json
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from contextlib import contextmanager
|
||||
from io import BytesIO
|
||||
from tempfile import TemporaryDirectory, NamedTemporaryFile
|
||||
from unittest.mock import patch
|
||||
from urllib.request import urlopen, Request, urlretrieve
|
||||
from zipfile import ZipFile
|
||||
|
||||
from repo2docker.contentproviders import Hydroshare
|
||||
|
||||
|
||||
def test_content_id():
    """After ``detect`` on a Hydroshare DOI, ``content_id`` is the resource id."""
    resource_id = "b8f6eae9d89241cf8b5904033460af61"

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # the mocked DOI lookup lands on the resource page
        fake_urlopen.return_value.url = (
            "https://www.hydroshare.org/resource/" + resource_id
        )
        hydro = Hydroshare()

        hydro.detect("10.4211/hs." + resource_id)
        assert hydro.content_id == resource_id
|
||||
|
||||
|
||||
# Parametrization data for test_detect_hydroshare: each entry pairs a list of
# inputs (resource URL, bare DOI, doi.org URL) with the spec that detect()
# is expected to return for every one of them.
test_hosts = [
    (
        [
            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
        ],
        {
            "host": {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            },
            "resource": "b8f6eae9d89241cf8b5904033460af61",
        },
    ),
]
|
||||
|
||||
class MockInfo:
    """Minimal stand-in for the object returned by a response's ``info()``.

    Only ``get_content_type()`` is provided; it returns the value given at
    construction time.
    """

    def __init__(self, content_type):
        # kept as a public attribute, exactly as passed in
        self.content_type = content_type

    def get_content_type(self):
        """Return the content type this mock was created with."""
        return self.content_type
|
||||
|
||||
class MockResponse:
    """Fake HTTP response offering just ``getcode()`` and ``info()``.

    ``info()`` returns a ``MockInfo`` built from ``content_type``;
    ``getcode()`` returns ``status_code``.
    """

    def __init__(self, content_type, status_code):
        self.content_type = content_type
        self.status_code = status_code
        # one MockInfo per response, handed out by every info() call
        self.mock_info = MockInfo(content_type)

    def getcode(self):
        """Return the status code given at construction time."""
        return self.status_code

    def info(self):
        """Return the ``MockInfo`` created for this response."""
        return self.mock_info
|
||||
|
||||
@pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_hydroshare(test_input, expected):
    """Check which inputs do, and which do not, trigger the Hydroshare provider."""
    resource_url, bare_doi, doi_url = test_input[:3]

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        fake_urlopen.return_value.url = resource_url
        # valid Hydroshare DOIs trigger this content provider
        for candidate in (resource_url, bare_doi, doi_url):
            assert Hydroshare().detect(candidate) == expected
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 2

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # Don't trigger the Hydroshare content provider
        for unrelated in ("/some/path/here", "https://example.com/path/here"):
            assert Hydroshare().detect(unrelated) is None
        # don't handle DOIs that aren't from Hydroshare
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
|
||||
|
||||
@contextmanager
def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
    """Yield a temporary ``.zip`` file containing two text files under ``prefix``.

    The yielded object is the open ``NamedTemporaryFile``; the archive is
    written to it by name before it is handed to the caller.
    """
    members = (
        ("some-file.txt", "some content"),
        ("some-other-file.txt", "some more content"),
    )
    with NamedTemporaryFile(suffix=".zip") as zfile:
        with ZipFile(zfile.name, mode="w") as archive:
            for filename, text in members:
                archive.writestr("{}/{}".format(prefix, filename), text)

        yield zfile
|
||||
|
||||
def test_fetch_bag():
    """Fetch a mocked bag that becomes ready on the second poll and check it unpacks.

    ``urlopen`` first reports a non-zip response (bag still being prepared)
    and then a zip response; ``_urlretrieve`` hands back a local archive.
    """
    # first poll: bag not ready yet; second poll: bag ready
    responses = [
        MockResponse("application/html", 200),
        MockResponse("application/zip", 200),
    ]
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }

    # we "fetch" a local ZIP file to simulate a Hydroshare resource
    with hydroshare_archive() as hydro_path:
        with patch.object(Hydroshare, "urlopen", side_effect=responses), \
                patch.object(Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]), \
                patch("time.sleep"):
            # fetch() waits with time.sleep between polls and once more before
            # downloading; stubbing it keeps the test from really waiting.
            hydro = Hydroshare()
            hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"

            with TemporaryDirectory() as d:
                # fetch() is a generator: drain it so the download actually runs
                list(hydro.fetch(spec, d))

                unpacked_files = set(os.listdir(d))
                expected = {"some-other-file.txt", "some-file.txt"}
                assert expected == unpacked_files
|
||||
|
||||
def test_fetch_bag_failure():
    """Check that ``fetch`` ends with a failure message when the bag request returns 500."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    failing_response = MockResponse("application/html", 500)

    with hydroshare_archive():
        with patch.object(Hydroshare, "urlopen", side_effect=[failing_response]):
            hydro = Hydroshare()
            with TemporaryDirectory() as d:
                # collect every message the generator yields
                output = list(hydro.fetch(spec, d))
                assert "Failed to download bag. status code 500.\n" == output[-1]
|
||||
|
||||
def test_fetch_bag_timeout():
    """Check that ``fetch`` gives up with a timeout message when ``timeout=0``."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # a 200 response that is not a zip, i.e. the bag is not ready
    pending_response = MockResponse("application/html", 200)

    with hydroshare_archive():
        with patch.object(Hydroshare, "urlopen", side_effect=[pending_response]):
            hydro = Hydroshare()
            with TemporaryDirectory() as d:
                # collect every message the generator yields
                output = list(hydro.fetch(spec, d, timeout=0))
                assert "Bag taking too long to prepare, exiting now, try again later." == output[-1]
|
||||
|
Ładowanie…
Reference in New Issue