update hydroshare content provider to doi and add tests

pull/800/head
Scott Black 2019-09-25 09:59:44 -06:00
rodzic 0549577a51
commit d89f3a66aa
2 zmienionych plików z 175 dodań i 41 usunięć

Wyświetl plik

@ -6,42 +6,13 @@ import time
from urllib.request import urlopen, Request, urlretrieve
from urllib.error import HTTPError
from .base import ContentProvider
from .doi import DoiProvider
from ..utils import normalize_doi, is_doi
class Hydroshare(ContentProvider):
class Hydroshare(DoiProvider):
"""Provide contents of a Hydroshare resource."""
def _urlopen(self, req, headers=None):
    """Open ``req`` with ``urlopen`` after attaching optional headers.

    ``req`` may be a ``Request`` or a plain URL string; a string is wrapped
    in a ``Request`` first. Each key/value pair of ``headers`` (when given)
    is added as a request header. Returns whatever ``urlopen`` returns.
    """
    # Callers may hand us a bare URL string instead of a Request.
    request = req if isinstance(req, Request) else Request(req)
    # NOTE: no User-Agent header is set on the request here.
    extra_headers = {} if headers is None else headers
    for name, value in extra_headers.items():
        request.add_header(name, value)
    return urlopen(request)
def _doi2url(self, doi):
    """Resolve a DOI to the URL it points at.

    Anything that is not a DOI is assumed to already be a URL and is
    returned unchanged. A DOI is normalized and looked up through
    https://doi.org; if that lookup raises ``HTTPError`` the normalized
    DOI itself is returned instead of a URL.
    """
    if not is_doi(doi):
        # Not a DOI, so treat the input as a URL and pass it through.
        return doi

    normalized = normalize_doi(doi)
    try:
        response = self._urlopen("https://doi.org/{}".format(normalized))
    except HTTPError:
        # The DOI does not resolve; hand back the normalized DOI as-is.
        return normalized
    return response.url
def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
# We need the hostname (url where records are), api url (for metadata),
@ -54,14 +25,17 @@ class Hydroshare(ContentProvider):
},
]
url = self._doi2url(doi)
url = self.doi2url(doi)
for host in hosts:
if any([url.startswith(s) for s in host["hostname"]]):
self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
return {"resource": self.resource_id, "host": host}
def fetch(self, spec, output_dir, yield_output=False):
@staticmethod
def _urlretrieve(bag_url):
    """Download ``bag_url`` with ``urlretrieve`` and return its result.

    This thin wrapper exists so tests can patch the download step.

    It is a staticmethod because ``fetch`` calls it as
    ``self._urlretrieve(bag_url)``: as a plain method without ``self`` the
    instance would be bound to ``bag_url`` and the call would raise
    ``TypeError`` for the extra argument.
    """
    return urlretrieve(bag_url)
def fetch(self, spec, output_dir, yield_output=False, timeout=120):
"""Fetch and unpack a Hydroshare resource"""
resource_id = spec["resource"]
host = spec["host"]
@ -71,19 +45,23 @@ class Hydroshare(ContentProvider):
bag_url = "{}{}".format(host["django_irods"], resource_id)
# bag downloads are prepared on demand and may need some time
conn = urlopen(bag_url)
while conn.info().get_content_type() != "application/zip":
if conn.getcode() != 200:
yield "Failed to download bag. status code {}.\n".format(conn.getcode())
return
conn = self.urlopen(bag_url)
total_wait_time = 0
while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip":
wait_time = 10
total_wait_time += wait_time
if total_wait_time > timeout:
yield "Bag taking too long to prepare, exiting now, try again later."
return
yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
time.sleep(wait_time)
conn = urlopen(bag_url)
conn = self.urlopen(bag_url)
if conn.getcode() != 200:
yield "Failed to download bag. status code {}.\n".format(conn.getcode())
return
# Bag creation seems to need a small time buffer after it says it's ready.
time.sleep(1)
filehandle, _ = urlretrieve(bag_url)
filehandle, _ = self._urlretrieve(bag_url)
zip_file_object = zipfile.ZipFile(filehandle, 'r')
yield "Downloaded, unpacking contents.\n"
zip_file_object.extractall("temp")

Wyświetl plik

@ -0,0 +1,156 @@
import json
import os
import pytest
from contextlib import contextmanager
from io import BytesIO
from tempfile import TemporaryDirectory, NamedTemporaryFile
from unittest.mock import patch
from urllib.request import urlopen, Request, urlretrieve
from zipfile import ZipFile
from repo2docker.contentproviders import Hydroshare
def test_content_id():
    """Detecting a Hydroshare DOI exposes the resource id as ``content_id``."""
    resolved_url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
    with patch.object(Hydroshare, "urlopen") as mocked_urlopen:
        # Stand-in for the URL the provider sees when it opens the DOI.
        mocked_urlopen.return_value.url = resolved_url
        provider = Hydroshare()
        provider.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
        assert provider.content_id == "b8f6eae9d89241cf8b5904033460af61"
# Parametrization data for test_detect_hydroshare. Each entry is a pair of
# (inputs, expected spec); the inputs are, in order, the resource URL, the
# bare DOI and the doi.org URL, and detect() must return the same spec for
# all three.
test_hosts = [
    (
        [
            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
        ],
        {
            "host": {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            },
            "resource": "b8f6eae9d89241cf8b5904033460af61",
        },
    ),
]
class MockInfo:
    """Minimal stand-in for the object returned by a response's ``info()``."""

    def __init__(self, content_type):
        # Kept verbatim so get_content_type() can hand it straight back.
        self.content_type = content_type

    def get_content_type(self):
        """Return the content type supplied at construction."""
        return self.content_type
class MockResponse:
    """Fake HTTP response offering only ``getcode()`` and ``info()``."""

    def __init__(self, content_type, status_code):
        self.content_type = content_type
        self.status_code = status_code
        # Built once so every info() call returns the same object.
        self.mock_info = MockInfo(content_type)

    def getcode(self):
        """Return the status code supplied at construction."""
        return self.status_code

    def info(self):
        """Return the ``MockInfo`` carrying this response's content type."""
        return self.mock_info
@pytest.mark.parametrize("test_input,expected", test_hosts)
def test_detect_hydroshare(test_input, expected):
    """Check which inputs ``Hydroshare.detect`` accepts and which it ignores."""
    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        fake_urlopen.return_value.url = test_input[0]
        # valid Hydroshare DOIs trigger this content provider
        for position in range(3):
            assert Hydroshare().detect(test_input[position]) == expected
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 2

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # Don't trigger the Hydroshare content provider
        for not_hydroshare in ("/some/path/here", "https://example.com/path/here"):
            assert Hydroshare().detect(not_hydroshare) is None
        # don't handle DOIs that aren't from Hydroshare
        foreign_url = "http://joss.theoj.org/papers/10.21105/joss.01277"
        fake_urlopen.return_value.url = foreign_url
        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager
def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
    """Yield a temporary ZIP file that mimics a downloaded Hydroshare bag.

    The archive holds two small text files stored under ``prefix``. The
    yielded value is the open ``NamedTemporaryFile`` object (its ``.name``
    is the path on disk); the file is removed when the context exits.
    """
    with NamedTemporaryFile(suffix=".zip") as zfile:
        # ``archive`` rather than ``zip`` so the builtin is not shadowed.
        with ZipFile(zfile.name, mode="w") as archive:
            archive.writestr("{}/some-file.txt".format(prefix), "some content")
            archive.writestr(
                "{}/some-other-file.txt".format(prefix), "some more content"
            )
        # Yield only after the ZipFile is closed so the archive is complete.
        yield zfile
def test_fetch_bag():
    """Fetching a bag that needs one retry unpacks its files into the output dir.

    The first mocked ``urlopen`` response is not a ZIP, so ``fetch`` has to
    request the bag again; the second response is a ZIP and the download
    itself is replaced by a local archive.
    """
    responses = [
        MockResponse("application/html", 200),
        MockResponse("application/zip", 200),
    ]
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # we "fetch" a local ZIP file to simulate a Hydroshare resource
    with hydroshare_archive() as hydro_path:
        # fetch() calls time.sleep() between retries and once more before
        # downloading; stub it out so the test does not really wait ~11 s.
        with patch.object(
            Hydroshare, "urlopen", side_effect=responses
        ), patch.object(
            Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]
        ), patch("time.sleep"):
            hydro = Hydroshare()
            hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"
            with TemporaryDirectory() as d:
                # fetch() is a generator: drain it so the work actually runs.
                for _ in hydro.fetch(spec, d):
                    pass
                unpacked_files = set(os.listdir(d))
                expected = {"some-other-file.txt", "some-file.txt"}
                assert expected == unpacked_files
def test_fetch_bag_failure():
    """A 500 response makes ``fetch`` end with the download-failure message."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    server_error = MockResponse("application/html", 500)
    with hydroshare_archive():
        with patch.object(Hydroshare, "urlopen", side_effect=[server_error]):
            hydro = Hydroshare()
            with TemporaryDirectory() as d:
                output = list(hydro.fetch(spec, d))
                assert "Failed to download bag. status code 500.\n" == output[-1]
def test_fetch_bag_timeout():
    """With ``timeout=0`` a not-yet-ready bag ends ``fetch`` with the timeout message."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # A 200 response that is not a ZIP: the bag is still being prepared.
    not_ready = MockResponse("application/html", 200)
    with hydroshare_archive():
        with patch.object(Hydroshare, "urlopen", side_effect=[not_ready]):
            hydro = Hydroshare()
            with TemporaryDirectory() as d:
                output = list(hydro.fetch(spec, d, timeout=0))
                assert "Bag taking too long to prepare, exiting now, try again later." == output[-1]