From d89f3a66aacecbc55bbc0a13bb1b47f9dfcaca78 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Wed, 25 Sep 2019 09:59:44 -0600 Subject: [PATCH] update hydroshare content provider to doi and add tests --- repo2docker/contentproviders/hydroshare.py | 60 +++---- .../unit/contentproviders/test_hydroshare.py | 156 ++++++++++++++++++ 2 files changed, 175 insertions(+), 41 deletions(-) create mode 100644 tests/unit/contentproviders/test_hydroshare.py diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 15af9b01..60e7d5b6 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -6,42 +6,13 @@ import time from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError -from .base import ContentProvider +from .doi import DoiProvider from ..utils import normalize_doi, is_doi -class Hydroshare(ContentProvider): +class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" - def _urlopen(self, req, headers=None): - """A urlopen() helper""" - # someone passed a string, not a request - if not isinstance(req, Request): - req = Request(req) - - #req.add_header("User-Agent", "repo2docker {}".format(__version__)) - if headers is not None: - for key, value in headers.items(): - req.add_header(key, value) - - return urlopen(req) - - def _doi2url(self, doi): - # Transform a DOI to a URL - # If not a doi, assume we have a URL and return - if is_doi(doi): - doi = normalize_doi(doi) - - try: - resp = self._urlopen("https://doi.org/{}".format(doi)) - # If the DOI doesn't resolve, just return URL - except HTTPError: - return doi - return resp.url - else: - # Just return what is actulally just a URL - return doi - def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" # We need the hostname (url where records are), api url (for metadata), @@ -54,14 +25,17 @@ class Hydroshare(ContentProvider): }, ] - url = self._doi2url(doi) + url = self.doi2url(doi) for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] return {"resource": self.resource_id, "host": host} - def fetch(self, spec, output_dir, yield_output=False): + def _urlretrieve(bag_url): + return urlretrieve(bag_url) + + def fetch(self, spec, output_dir, yield_output=False, timeout=120): """Fetch and unpack a Hydroshare resource""" resource_id = spec["resource"] host = spec["host"] @@ -71,19 +45,23 @@ class Hydroshare(ContentProvider): bag_url = "{}{}".format(host["django_irods"], resource_id) # bag downloads are prepared on demand and may need some time - conn = urlopen(bag_url) - while conn.info().get_content_type() != "application/zip": - if conn.getcode() != 200: - yield "Failed to download bag. status code {}.\n".format(conn.getcode()) - return + conn = self.urlopen(bag_url) + total_wait_time = 0 + while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip": wait_time = 10 + total_wait_time += wait_time + if total_wait_time > timeout: + yield "Bag taking too long to prepare, exiting now, try again later." + return yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time) time.sleep(wait_time) - conn = urlopen(bag_url) - + conn = self.urlopen(bag_url) + if conn.getcode() != 200: + yield "Failed to download bag. status code {}.\n".format(conn.getcode()) + return # Bag creation seems to need a small time buffer after it says it's ready. time.sleep(1) - filehandle, _ = urlretrieve(bag_url) + filehandle, _ = self._urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') yield "Downloaded, unpacking contents.\n" zip_file_object.extractall("temp") diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py new file mode 100644 index 00000000..a5a64d0a --- /dev/null +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -0,0 +1,156 @@ +import json +import os +import pytest + +from contextlib import contextmanager +from io import BytesIO +from tempfile import TemporaryDirectory, NamedTemporaryFile +from unittest.mock import patch +from urllib.request import urlopen, Request, urlretrieve +from zipfile import ZipFile + +from repo2docker.contentproviders import Hydroshare + + +def test_content_id(): + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + hydro = Hydroshare() + + hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") + assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61" + + +test_hosts = [ + ( + [ + "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + "10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + ], + { + "host": { + "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "b8f6eae9d89241cf8b5904033460af61", + }, + ), +] + +class MockInfo: + def __init__(self, content_type): + self.content_type = content_type + + def get_content_type(self): + return self.content_type + +class MockResponse: + def __init__(self, content_type, status_code): + self.content_type = content_type + self.status_code = status_code + self.mock_info = MockInfo(self.content_type) + + def getcode(self): + return self.status_code + + def info(self): + return self.mock_info + +@pytest.mark.parametrize("test_input,expected", test_hosts) +def test_detect_hydroshare(test_input, expected): + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = test_input[0] + # valid Hydroshare DOIs trigger this content provider + assert Hydroshare().detect(test_input[0]) == expected + assert Hydroshare().detect(test_input[1]) == expected + assert Hydroshare().detect(test_input[2]) == expected + # only two of the three calls above have to resolve a DOI + assert fake_urlopen.call_count == 2 + + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + # Don't trigger the Hydroshare content provider + assert Hydroshare().detect("/some/path/here") is None + assert Hydroshare().detect("https://example.com/path/here") is None + # don't handle DOIs that aren't from Hydroshare + fake_urlopen.return_value.url = ( + "http://joss.theoj.org/papers/10.21105/joss.01277" + ) + assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None + +@contextmanager +def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): + with NamedTemporaryFile(suffix=".zip") as zfile: + with ZipFile(zfile.name, mode="w") as zip: + zip.writestr("{}/some-file.txt".format(prefix), "some content") + zip.writestr("{}/some-other-file.txt".format(prefix), "some more content") + + yield zfile + +def test_fetch_bag(): + # we "fetch" a local ZIP file to simulate a Hydroshare resource + with hydroshare_archive() as hydro_path: + with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200), MockResponse("application/zip", 200)]): + with patch.object(Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]): + hydro = Hydroshare() + hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61" + spec = { + "host": { + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "123456789", + } + + with TemporaryDirectory() as d: + output = [] + for l in hydro.fetch(spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + expected = set(["some-other-file.txt", "some-file.txt"]) + assert expected == unpacked_files + +def test_fetch_bag_failure(): + with hydroshare_archive() as hydro_path: + with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]): + hydro = Hydroshare() + spec = { + "host": { + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "123456789", + } + with TemporaryDirectory() as d: + output = [] + for l in hydro.fetch(spec, d): + output.append(l) + assert "Failed to download bag. status code 500.\n" == output[-1] + +def test_fetch_bag_timeout(): + with hydroshare_archive() as hydro_path: + with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]): + hydro = Hydroshare() + spec = { + "host": { + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "123456789", + } + with TemporaryDirectory() as d: + output = [] + for l in hydro.fetch(spec, d, timeout=0): + output.append(l) + assert "Bag taking too long to prepare, exiting now, try again later." == output[-1] +