update hydroshare content provider to doi and add tests

2019-09-25 09:59:44 -06:00 · 2019-09-25 09:59:44 -06:00 · d89f3a66aa
commit d89f3a66aa
--- a/repo2docker/contentproviders/hydroshare.py
+++ b/repo2docker/contentproviders/hydroshare.py
@ -6,42 +6,13 @@ import time
 from urllib.request import urlopen, Request, urlretrieve
 from urllib.error import HTTPError

-from .base import ContentProvider
+from .doi import DoiProvider
 from ..utils import normalize_doi, is_doi


-class Hydroshare(ContentProvider):
+class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource."""

-    def _urlopen(self, req, headers=None):
-        """A urlopen() helper"""
-        # someone passed a string, not a request
-        if not isinstance(req, Request):
-            req = Request(req)
-
-        #req.add_header("User-Agent", "repo2docker {}".format(__version__))
-        if headers is not None:
-            for key, value in headers.items():
-                req.add_header(key, value)
-
-        return urlopen(req)
-
-    def _doi2url(self, doi):
-        # Transform a DOI to a URL
-        # If not a doi, assume we have a URL and return
-        if is_doi(doi):
-            doi = normalize_doi(doi)
-
-            try:
-                resp = self._urlopen("https://doi.org/{}".format(doi))
-            # If the DOI doesn't resolve, just return URL
-            except HTTPError:
-                return doi
-            return resp.url
-        else:
-            # Just return what is actulally just a URL
-            return doi
-
    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Zenodo/Invenio record"""
        # We need the hostname (url where records are), api url (for metadata),
@ -54,14 +25,17 @@ class Hydroshare(ContentProvider):
            },
        ]

-        url = self._doi2url(doi)
+        url = self.doi2url(doi)

        for host in hosts:
            if any([url.startswith(s) for s in host["hostname"]]):
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                return {"resource": self.resource_id, "host": host}

-    def fetch(self, spec, output_dir, yield_output=False):
+    def _urlretrieve(self, bag_url):
+        """Download ``bag_url`` via ``urllib.request.urlretrieve``.
+
+        Kept as its own method so the download step can be patched in tests.
+        Returns the ``(filename, headers)`` tuple produced by ``urlretrieve``.
+        """
+        # ``self`` is required here: fetch() invokes this as
+        # ``self._urlretrieve(bag_url)``, which would raise TypeError otherwise.
+        return urlretrieve(bag_url)
+
+    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource"""
        resource_id = spec["resource"]
        host = spec["host"]
@ -71,19 +45,23 @@ class Hydroshare(ContentProvider):
        bag_url = "{}{}".format(host["django_irods"], resource_id)

        # bag downloads are prepared on demand and may need some time
-        conn = urlopen(bag_url)
-        while conn.info().get_content_type() != "application/zip":
-            if conn.getcode() != 200:
-                yield "Failed to download bag. status code {}.\n".format(conn.getcode())
-                return
+        conn = self.urlopen(bag_url)
+        total_wait_time = 0
+        while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip":
            wait_time = 10
+            total_wait_time += wait_time
+            if total_wait_time > timeout:
+                yield "Bag taking too long to prepare, exiting now, try again later."
+                return
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)
            time.sleep(wait_time)
-            conn = urlopen(bag_url)
-
+            conn = self.urlopen(bag_url)
+        if conn.getcode() != 200:
+            yield "Failed to download bag. status code {}.\n".format(conn.getcode())
+            return
        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
-        filehandle, _ = urlretrieve(bag_url)
+        filehandle, _ = self._urlretrieve(bag_url)
        zip_file_object = zipfile.ZipFile(filehandle, 'r')
        yield "Downloaded, unpacking contents.\n"
        zip_file_object.extractall("temp")
--- a/tests/unit/contentproviders/test_hydroshare.py
+++ b/tests/unit/contentproviders/test_hydroshare.py
@ -0,0 +1,156 @@
+import json
+import os
+import pytest
+
+from contextlib import contextmanager
+from io import BytesIO
+from tempfile import TemporaryDirectory, NamedTemporaryFile
+from unittest.mock import patch
+from urllib.request import urlopen, Request, urlretrieve
+from zipfile import ZipFile
+
+from repo2docker.contentproviders import Hydroshare
+
+
+def test_content_id():
+    """Check that content_id equals the resource id after detect() on a Hydroshare DOI."""
+    # urlopen is mocked; its return value's ``url`` is the resource page URL
+    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
+        fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
+        hydro = Hydroshare()
+
+        hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
+        assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61"
+
+
+# Parametrization data for test_detect_hydroshare: each entry pairs a list of
+# three references to one resource (resource URL, bare DOI, doi.org URL) with
+# the spec that detect() is expected to return for every one of them.
+test_hosts = [
+    (
+        [
+            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
+            "10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
+            "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61",
+        ],
+        {
+            "host": {
+                "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"],
+                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
+            },
+            "resource": "b8f6eae9d89241cf8b5904033460af61",
+        },
+    ),
+]
+
+class MockInfo:
+    """Stand-in for the object a response returns from ``info()``."""
+
+    def __init__(self, content_type):
+        # fetch() only ever asks this object for its content type.
+        self.content_type = content_type
+
+    def get_content_type(self):
+        """Return the content type supplied at construction."""
+        return self.content_type
+  
+class MockResponse:
+    """Minimal fake HTTP response offering ``getcode()`` and ``info()``."""
+
+    def __init__(self, content_type, status_code):
+        self.content_type, self.status_code = content_type, status_code
+        # Built once up front so every info() call hands back the same object.
+        self.mock_info = MockInfo(content_type)
+
+    def getcode(self):
+        """Return the status code supplied at construction."""
+        return self.status_code
+
+    def info(self):
+        """Return the MockInfo carrying this response's content type."""
+        return self.mock_info
+
+@pytest.mark.parametrize("test_input,expected", test_hosts)
+def test_detect_hydroshare(test_input, expected):
+    """Check which inputs detect() accepts as Hydroshare resources.
+
+    ``test_input`` holds three references to the same resource and
+    ``expected`` is the spec detect() should return for each of them.
+    """
+    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
+        fake_urlopen.return_value.url = test_input[0]
+        # valid Hydroshare DOIs trigger this content provider
+        assert Hydroshare().detect(test_input[0]) == expected
+        assert Hydroshare().detect(test_input[1]) == expected
+        assert Hydroshare().detect(test_input[2]) == expected
+        # only two of the three calls above have to resolve a DOI
+        assert fake_urlopen.call_count == 2
+
+    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
+        # Don't trigger the Hydroshare content provider
+        assert Hydroshare().detect("/some/path/here") is None
+        assert Hydroshare().detect("https://example.com/path/here") is None
+        # don't handle DOIs that aren't from Hydroshare
+        fake_urlopen.return_value.url = (
+            "http://joss.theoj.org/papers/10.21105/joss.01277"
+        )
+        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
+
+@contextmanager
+def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
+    """Yield a temporary ZIP file that simulates a Hydroshare resource download.
+
+    The archive holds two small text files stored under ``prefix``. The
+    yielded object is the open ``NamedTemporaryFile``; it is removed when the
+    context exits.
+    """
+    with NamedTemporaryFile(suffix=".zip") as zfile:
+        # Named ``archive`` rather than ``zip`` so the builtin is not shadowed.
+        with ZipFile(zfile.name, mode="w") as archive:
+            archive.writestr("{}/some-file.txt".format(prefix), "some content")
+            archive.writestr("{}/some-other-file.txt".format(prefix), "some more content")
+
+        yield zfile
+
+def test_fetch_bag():
+    """Fetch a simulated bag that only becomes ready on the second request."""
+    # we "fetch" a local ZIP file to simulate a Hydroshare resource
+    responses = [
+        MockResponse("application/html", 200),
+        MockResponse("application/zip", 200),
+    ]
+    with hydroshare_archive() as hydro_path:
+        with patch.object(Hydroshare, "urlopen", side_effect=responses):
+            with patch.object(Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]):
+                # fetch() sleeps between polls and once more before downloading;
+                # stub the sleep out so the test does not really wait.
+                with patch("time.sleep"):
+                    hydro = Hydroshare()
+                    hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"
+                    spec = {
+                        "host": {
+                            "hostname": [
+                                "https://www.hydroshare.org/resource/",
+                                "http://www.hydroshare.org/resource/",
+                            ],
+                            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
+                        },
+                        "resource": "123456789",
+                    }
+
+                    with TemporaryDirectory() as d:
+                        # fetch() is a generator: drain it so the work happens.
+                        for _ in hydro.fetch(spec, d):
+                            pass
+
+                        unpacked_files = set(os.listdir(d))
+                        expected = {"some-other-file.txt", "some-file.txt"}
+                        assert expected == unpacked_files
+
+def test_fetch_bag_failure():
+    """A non-200 reply to the bag request makes fetch() report the failure."""
+    # No archive fixture is needed: fetch() stops before downloading anything.
+    with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]):
+        hydro = Hydroshare()
+        spec = {
+            "host": {
+                "hostname": [
+                    "https://www.hydroshare.org/resource/",
+                    "http://www.hydroshare.org/resource/",
+                ],
+                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
+            },
+            "resource": "123456789",
+        }
+        with TemporaryDirectory() as d:
+            output = list(hydro.fetch(spec, d))
+            assert "Failed to download bag. status code 500.\n" == output[-1]
+
+def test_fetch_bag_timeout():
+    """With ``timeout=0`` fetch() gives up while the bag is still being prepared."""
+    # No archive fixture is needed: fetch() returns before any download, and
+    # with a zero timeout it also returns before its first sleep.
+    with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]):
+        hydro = Hydroshare()
+        spec = {
+            "host": {
+                "hostname": [
+                    "https://www.hydroshare.org/resource/",
+                    "http://www.hydroshare.org/resource/",
+                ],
+                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
+            },
+            "resource": "123456789",
+        }
+        with TemporaryDirectory() as d:
+            output = list(hydro.fetch(spec, d, timeout=0))
+            assert "Bag taking too long to prepare, exiting now, try again later." == output[-1]
+