update hydroshare content provider to doi and add tests

2019-09-25 09:59:44 -06:00 · 2019-09-25 09:59:44 -06:00 · d89f3a66aa
commit d89f3a66aa
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@ -6,42 +6,13 @@ import time
 from urllib.request import urlopen, Request, urlretrieve
 from urllib.error import HTTPError
-from .base import ContentProvider
+from .doi import DoiProvider
 from ..utils import normalize_doi, is_doi
-class Hydroshare(ContentProvider):
+class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource."""
    def _urlopen(self, req, headers=None):
        """A urlopen() helper"""
        # someone passed a string, not a request
        if not isinstance(req, Request):
            req = Request(req)
        #req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)
        return urlopen(req)
    def _doi2url(self, doi):
        """Turn a DOI into a URL by looking it up on https://doi.org.

        Input that ``is_doi`` rejects is assumed to already be a URL and is
        returned unchanged. A DOI is normalized and requested from doi.org,
        and the ``url`` attribute of the response is returned. If the request
        raises ``HTTPError`` the normalized DOI is returned instead.
        """
        if not is_doi(doi):
            # Not a DOI, so the input is treated as a URL already
            return doi
        normalized = normalize_doi(doi)
        try:
            response = self._urlopen("https://doi.org/{}".format(normalized))
        except HTTPError:
            # The DOI did not resolve; hand back the normalized DOI
            return normalized
        return response.url
    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Zenodo/Invenio record"""
        # We need the hostname (url where records are), api url (for metadata),
@ -54,14 +25,17 @@ class Hydroshare(ContentProvider):
            },
        ]
-        url = self._doi2url(doi)
+        url = self.doi2url(doi)
        for host in hosts:
            if any([url.startswith(s) for s in host["hostname"]]):
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                return {"resource": self.resource_id, "host": host}
-    def fetch(self, spec, output_dir, yield_output=False):
+    def _urlretrieve(bag_url):
        return urlretrieve(bag_url)
    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource"""
        resource_id = spec["resource"]
        host = spec["host"]
@ -71,19 +45,23 @@ class Hydroshare(ContentProvider):
        bag_url = "{}{}".format(host["django_irods"], resource_id)
        # bag downloads are prepared on demand and may need some time
-        conn = urlopen(bag_url)
+        conn = self.urlopen(bag_url)
-        while conn.info().get_content_type() != "application/zip":
+        total_wait_time = 0
-            if conn.getcode() != 200:
+        while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip":
                yield "Failed to download bag. status code {}.\n".format(conn.getcode())
                return
            wait_time = 10
            total_wait_time += wait_time
            if total_wait_time > timeout:
                yield "Bag taking too long to prepare, exiting now, try again later."
                return
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
            time.sleep(wait_time)
-            conn = urlopen(bag_url)
+            conn = self.urlopen(bag_url)
-
+        if conn.getcode() != 200:
            yield "Failed to download bag. status code {}.\n".format(conn.getcode())
            return
        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
-        filehandle, _ = urlretrieve(bag_url)
+        filehandle, _ = self._urlretrieve(bag_url)
        zip_file_object = zipfile.ZipFile(filehandle, 'r')
        yield "Downloaded, unpacking contents.\n"
        zip_file_object.extractall("temp")
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ b/tests/unit/contentproviders/test_hydroshare.py
@ -0,0 +1,156 @@
 import json
 import os
 import pytest
 from contextlib import contextmanager
 from io import BytesIO
 from tempfile import TemporaryDirectory, NamedTemporaryFile
 from unittest.mock import patch
 from urllib.request import urlopen, Request, urlretrieve
 from zipfile import ZipFile
 from repo2docker.contentproviders import Hydroshare
 def test_content_id():
    """``content_id`` matches the resource id taken from the resolved DOI URL."""
    resolved_url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
    with patch.object(Hydroshare, "urlopen") as mocked_urlopen:
        # The faked DOI lookup reports the Hydroshare resource URL
        mocked_urlopen.return_value.url = resolved_url
        provider = Hydroshare()
        provider.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
        assert provider.content_id == "b8f6eae9d89241cf8b5904033460af61"
 test_hosts = [
    (
        # Three spellings of one resource: resource URL, bare DOI, doi.org URL
        [
            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
        ],
        # The spec that detect() is expected to return for each spelling
        {
            "resource": "b8f6eae9d89241cf8b5904033460af61",
            "host": {
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
            },
        },
    ),
 ]
 class MockInfo:
    """Stand-in for the object a response returns from ``info()``.

    It only offers ``get_content_type``, which reports the content type
    supplied at construction.
    """
    def __init__(self, content_type):
        # Remembered so get_content_type() can report it later
        self.content_type = content_type
    def get_content_type(self):
        """Return the content type given to the constructor."""
        reported = self.content_type
        return reported
 class MockResponse:
    """Minimal fake response exposing ``getcode()`` and ``info()``.

    ``getcode()`` returns the status code given at construction and
    ``info()`` returns a ``MockInfo`` built from the given content type.
    """
    def __init__(self, content_type, status_code):
        self.status_code = status_code
        self.content_type = content_type
        # Built once so every info() call hands back the same object
        self.mock_info = MockInfo(content_type)
    def info(self):
        """Return the ``MockInfo`` created for this response."""
        return self.mock_info
    def getcode(self):
        """Return the status code passed to the constructor."""
        return self.status_code
@pytest.mark.parametrize("test_input,expected", test_hosts)
 def test_detect_hydroshare(test_input, expected):
    """Check which inputs ``Hydroshare.detect`` accepts and which it ignores.

    ``test_input`` holds three spellings of the same resource (resource URL,
    bare DOI, doi.org URL) and ``expected`` is the spec ``detect`` should
    return for each of them.
    """
    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # every faked lookup reports the Hydroshare resource URL as its result
        fake_urlopen.return_value.url = test_input[0]
        # valid Hydroshare DOIs trigger this content provider
        assert Hydroshare().detect(test_input[0]) == expected
        assert Hydroshare().detect(test_input[1]) == expected
        assert Hydroshare().detect(test_input[2]) == expected
        # only two of the three calls above have to resolve a DOI
        assert fake_urlopen.call_count == 2
    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # Don't trigger the Hydroshare content provider
        assert Hydroshare().detect("/some/path/here") is None
        assert Hydroshare().detect("https://example.com/path/here") is None
        # don't handle DOIs that aren't from Hydroshare
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager
 def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
    """Yield a temporary ``.zip`` file used to simulate a Hydroshare download.

    The archive holds ``some-file.txt`` and ``some-other-file.txt`` stored
    under ``prefix``. The open ``NamedTemporaryFile`` object itself is
    yielded, not its path.
    """
    with NamedTemporaryFile(suffix=".zip") as zfile:
        # the archive is written via the file's name and closed before yielding
        with ZipFile(zfile.name, mode="w") as zip:
            zip.writestr("{}/some-file.txt".format(prefix), "some content")
            zip.writestr("{}/some-other-file.txt".format(prefix), "some more content")
        yield zfile
 def test_fetch_bag():
    """``fetch`` retries until the bag is a zip, then unpacks it into the output dir.

    A local ZIP file stands in for the Hydroshare resource. The first faked
    response is 200 but not a zip, so ``fetch`` takes its wait-and-retry path;
    ``time.sleep`` is patched so that wait (and the short pause before the
    download) does not slow the test down.
    """
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # first request: bag still being prepared; second request: zip is ready
    responses = [
        MockResponse("application/html", 200),
        MockResponse("application/zip", 200),
    ]
    # we "fetch" a local ZIP file to simulate a Hydroshare resource
    with hydroshare_archive() as hydro_path:
        fake_urlopen = patch.object(Hydroshare, "urlopen", side_effect=responses)
        fake_urlretrieve = patch.object(
            Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]
        )
        with fake_urlopen, fake_urlretrieve, patch("time.sleep"):
            hydro = Hydroshare()
            hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"
            with TemporaryDirectory() as d:
                # fetch() is a generator: drain it so the download and unpack run
                for _ in hydro.fetch(spec, d):
                    pass
                unpacked_files = set(os.listdir(d))
                expected = {"some-other-file.txt", "some-file.txt"}
                assert expected == unpacked_files
 def test_fetch_bag_failure():
    """A 500 response makes ``fetch`` end with the failed-download message."""
    spec = {
        "resource": "123456789",
        "host": {
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
        },
    }
    error_response = MockResponse("application/html", 500)
    with hydroshare_archive():
        with patch.object(Hydroshare, "urlopen", side_effect=[error_response]):
            provider = Hydroshare()
            with TemporaryDirectory() as workdir:
                # Collect everything fetch() yields; the last item is the verdict
                messages = list(provider.fetch(spec, workdir))
                assert "Failed to download bag. status code 500.\n" == messages[-1]
 def test_fetch_bag_timeout():
    """With ``timeout=0`` ``fetch`` ends with the "taking too long" message."""
    spec = {
        "resource": "123456789",
        "host": {
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
        },
    }
    # 200 but not a zip: the bag is reported as not ready yet
    not_ready = MockResponse("application/html", 200)
    with hydroshare_archive():
        with patch.object(Hydroshare, "urlopen", side_effect=[not_ready]):
            provider = Hydroshare()
            with TemporaryDirectory() as workdir:
                messages = list(provider.fetch(spec, workdir, timeout=0))
                assert "Bag taking too long to prepare, exiting now, try again later." == messages[-1]