kopia lustrzana https://github.com/jupyterhub/repo2docker
update hydroshare content provider to doi and add tests
rodzic
0549577a51
commit
d89f3a66aa
|
@ -6,42 +6,13 @@ import time
|
||||||
from urllib.request import urlopen, Request, urlretrieve
|
from urllib.request import urlopen, Request, urlretrieve
|
||||||
from urllib.error import HTTPError
|
from urllib.error import HTTPError
|
||||||
|
|
||||||
from .base import ContentProvider
|
from .doi import DoiProvider
|
||||||
from ..utils import normalize_doi, is_doi
|
from ..utils import normalize_doi, is_doi
|
||||||
|
|
||||||
|
|
||||||
class Hydroshare(ContentProvider):
|
class Hydroshare(DoiProvider):
|
||||||
"""Provide contents of a Hydroshare resource."""
|
"""Provide contents of a Hydroshare resource."""
|
||||||
|
|
||||||
def _urlopen(self, req, headers=None):
|
|
||||||
"""A urlopen() helper"""
|
|
||||||
# someone passed a string, not a request
|
|
||||||
if not isinstance(req, Request):
|
|
||||||
req = Request(req)
|
|
||||||
|
|
||||||
#req.add_header("User-Agent", "repo2docker {}".format(__version__))
|
|
||||||
if headers is not None:
|
|
||||||
for key, value in headers.items():
|
|
||||||
req.add_header(key, value)
|
|
||||||
|
|
||||||
return urlopen(req)
|
|
||||||
|
|
||||||
def _doi2url(self, doi):
    """Resolve a DOI to the URL it redirects to.

    Anything ``is_doi`` rejects is assumed to already be a URL and is
    returned unchanged. A DOI is normalized and requested from doi.org, and
    the ``url`` of that response is returned. If the request raises
    ``HTTPError`` the normalized DOI itself is returned instead.
    """
    if not is_doi(doi):
        # Not a DOI: return what is actually just a URL.
        return doi

    doi = normalize_doi(doi)
    try:
        resp = self._urlopen("https://doi.org/{}".format(doi))
    except HTTPError:
        # The DOI doesn't resolve; hand back the normalized DOI as-is.
        return doi
    return resp.url
|
|
||||||
|
|
||||||
def detect(self, doi, ref=None, extra_args=None):
|
def detect(self, doi, ref=None, extra_args=None):
|
||||||
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
|
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
|
||||||
# We need the hostname (url where records are), api url (for metadata),
|
# We need the hostname (url where records are), api url (for metadata),
|
||||||
|
@ -54,14 +25,17 @@ class Hydroshare(ContentProvider):
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
url = self._doi2url(doi)
|
url = self.doi2url(doi)
|
||||||
|
|
||||||
for host in hosts:
|
for host in hosts:
|
||||||
if any([url.startswith(s) for s in host["hostname"]]):
|
if any([url.startswith(s) for s in host["hostname"]]):
|
||||||
self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
|
self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
|
||||||
return {"resource": self.resource_id, "host": host}
|
return {"resource": self.resource_id, "host": host}
|
||||||
|
|
||||||
def fetch(self, spec, output_dir, yield_output=False):
|
def _urlretrieve(bag_url):
|
||||||
|
return urlretrieve(bag_url)
|
||||||
|
|
||||||
|
def fetch(self, spec, output_dir, yield_output=False, timeout=120):
|
||||||
"""Fetch and unpack a Hydroshare resource"""
|
"""Fetch and unpack a Hydroshare resource"""
|
||||||
resource_id = spec["resource"]
|
resource_id = spec["resource"]
|
||||||
host = spec["host"]
|
host = spec["host"]
|
||||||
|
@ -71,19 +45,23 @@ class Hydroshare(ContentProvider):
|
||||||
bag_url = "{}{}".format(host["django_irods"], resource_id)
|
bag_url = "{}{}".format(host["django_irods"], resource_id)
|
||||||
|
|
||||||
# bag downloads are prepared on demand and may need some time
|
# bag downloads are prepared on demand and may need some time
|
||||||
conn = urlopen(bag_url)
|
conn = self.urlopen(bag_url)
|
||||||
while conn.info().get_content_type() != "application/zip":
|
total_wait_time = 0
|
||||||
if conn.getcode() != 200:
|
while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip":
|
||||||
yield "Failed to download bag. status code {}.\n".format(conn.getcode())
|
|
||||||
return
|
|
||||||
wait_time = 10
|
wait_time = 10
|
||||||
|
total_wait_time += wait_time
|
||||||
|
if total_wait_time > timeout:
|
||||||
|
yield "Bag taking too long to prepare, exiting now, try again later."
|
||||||
|
return
|
||||||
yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
|
yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
|
||||||
time.sleep(wait_time)
|
time.sleep(wait_time)
|
||||||
conn = urlopen(bag_url)
|
conn = self.urlopen(bag_url)
|
||||||
|
if conn.getcode() != 200:
|
||||||
|
yield "Failed to download bag. status code {}.\n".format(conn.getcode())
|
||||||
|
return
|
||||||
# Bag creation seems to need a small time buffer after it says it's ready.
|
# Bag creation seems to need a small time buffer after it says it's ready.
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
filehandle, _ = urlretrieve(bag_url)
|
filehandle, _ = self._urlretrieve(bag_url)
|
||||||
zip_file_object = zipfile.ZipFile(filehandle, 'r')
|
zip_file_object = zipfile.ZipFile(filehandle, 'r')
|
||||||
yield "Downloaded, unpacking contents.\n"
|
yield "Downloaded, unpacking contents.\n"
|
||||||
zip_file_object.extractall("temp")
|
zip_file_object.extractall("temp")
|
||||||
|
|
|
@ -0,0 +1,156 @@
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from io import BytesIO
|
||||||
|
from tempfile import TemporaryDirectory, NamedTemporaryFile
|
||||||
|
from unittest.mock import patch
|
||||||
|
from urllib.request import urlopen, Request, urlretrieve
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
from repo2docker.contentproviders import Hydroshare
|
||||||
|
|
||||||
|
|
||||||
|
def test_content_id():
    """After detect() on a Hydroshare DOI, content_id is the resource id."""
    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # The mocked DOI lookup reports the Hydroshare resource page URL.
        fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
        hydro = Hydroshare()

        hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
        assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61"
|
||||||
|
|
||||||
|
|
||||||
|
# Parametrize data: ([resource URL, bare DOI, doi.org URL], expected detect() result).
test_hosts = [
    (
        [
            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
        ],
        {
            "host": {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            },
            "resource": "b8f6eae9d89241cf8b5904033460af61",
        },
    ),
]
|
||||||
|
|
||||||
|
class MockInfo:
    """Stand-in for the object returned by a response's ``info()``."""

    def __init__(self, content_type):
        # Content type string reported by get_content_type().
        self.content_type = content_type

    def get_content_type(self):
        """Return the content type given at construction."""
        return self.content_type
|
||||||
|
|
||||||
|
class MockResponse:
    """Minimal fake HTTP response exposing ``getcode()`` and ``info()``."""

    def __init__(self, content_type, status_code):
        self.content_type = content_type
        self.status_code = status_code
        # Built once so every info() call returns the same object.
        self.mock_info = MockInfo(self.content_type)

    def getcode(self):
        """Return the status code given at construction."""
        return self.status_code

    def info(self):
        """Return the MockInfo carrying this response's content type."""
        return self.mock_info
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_hydroshare(test_input, expected):
    """detect() accepts Hydroshare URLs/DOIs and returns None for the rest."""
    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        fake_urlopen.return_value.url = test_input[0]
        # valid Hydroshare DOIs trigger this content provider
        assert Hydroshare().detect(test_input[0]) == expected
        assert Hydroshare().detect(test_input[1]) == expected
        assert Hydroshare().detect(test_input[2]) == expected
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 2

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # Don't trigger the Hydroshare content provider
        assert Hydroshare().detect("/some/path/here") is None
        assert Hydroshare().detect("https://example.com/path/here") is None
        # don't handle DOIs that aren't from Hydroshare
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
|
||||||
|
|
||||||
|
@contextmanager
def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
    """Yield a temporary ZIP file containing two small text files.

    Both members are stored under ``prefix``. The yielded object is the
    ``NamedTemporaryFile`` itself; it is removed when the context exits.
    """
    with NamedTemporaryFile(suffix=".zip") as zfile:
        # Write through the file name so the archive is closed (and therefore
        # complete) before the handle is given to the caller.
        with ZipFile(zfile.name, mode="w") as archive:
            archive.writestr("{}/some-file.txt".format(prefix), "some content")
            archive.writestr("{}/some-other-file.txt".format(prefix), "some more content")

        yield zfile
|
||||||
|
|
||||||
|
def test_fetch_bag():
    """fetch() retries until the bag is a ZIP and leaves its files in the output dir."""
    # First poll says the bag is still being prepared, second one is the ZIP.
    responses = [
        MockResponse("application/html", 200),
        MockResponse("application/zip", 200),
    ]
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }

    # we "fetch" a local ZIP file to simulate a Hydroshare resource
    with hydroshare_archive() as hydro_path:
        with patch.object(Hydroshare, "urlopen", side_effect=responses):
            with patch.object(Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]):
                # fetch() sleeps between polls and again before downloading;
                # stub it out so the test does not really wait ~11 seconds.
                with patch("time.sleep"):
                    hydro = Hydroshare()
                    hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"

                    with TemporaryDirectory() as d:
                        # fetch() is a generator: drain it to run the download.
                        output = list(hydro.fetch(spec, d))

                        unpacked_files = set(os.listdir(d))
                        expected = {"some-other-file.txt", "some-file.txt"}
                        assert expected == unpacked_files
|
||||||
|
|
||||||
|
def test_fetch_bag_failure():
    """A non-200 response makes fetch() end with a failure message."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }

    # No archive is needed: fetch() gives up before downloading anything.
    with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]):
        hydro = Hydroshare()
        with TemporaryDirectory() as d:
            output = list(hydro.fetch(spec, d))
            assert "Failed to download bag. status code 500.\n" == output[-1]
|
||||||
|
|
||||||
|
def test_fetch_bag_timeout():
    """fetch() gives up once the accumulated wait exceeds ``timeout``."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }

    # A 200 response that is not a ZIP means "bag still being prepared".
    with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]):
        hydro = Hydroshare()
        with TemporaryDirectory() as d:
            # timeout=0: the first 10-second wait already exceeds the budget,
            # so fetch() returns before it ever sleeps.
            output = list(hydro.fetch(spec, d, timeout=0))
            assert "Bag taking too long to prepare, exiting now, try again later." == output[-1]
|
||||||
|
|
Ładowanie…
Reference in New Issue