From 5f574d3a2ceb7ca406719145a6fa761d16cee739 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 16:30:48 -0600 Subject: [PATCH 01/24] initial checkin of hydroshare content provider --- repo2docker/contentproviders/__init__.py | 1 + repo2docker/contentproviders/hydroshare.py | 85 ++++++++++++++++++++++ 2 files changed, 86 insertions(+) mode change 100644 => 100755 repo2docker/contentproviders/__init__.py create mode 100755 repo2docker/contentproviders/hydroshare.py diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py old mode 100644 new mode 100755 index d648731d..36356377 --- a/repo2docker/contentproviders/__init__.py +++ b/repo2docker/contentproviders/__init__.py @@ -1,3 +1,4 @@ from .git import Git from .base import Local from .zenodo import Zenodo +from .hydroshare import Hydroshare diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py new file mode 100755 index 00000000..c20cff71 --- /dev/null +++ b/repo2docker/contentproviders/hydroshare.py @@ -0,0 +1,85 @@ +import os +import json +import shutil +import urllib +import zipfile + +from os import makedirs +from os import path +from urllib.request import urlopen, Request +from urllib.error import HTTPError +from zipfile import ZipFile, is_zipfile + +from .base import ContentProvider +from ..utils import copytree, deep_get +from ..utils import normalize_doi, is_doi +from .. import __version__ + + +class Hydroshare(ContentProvider): + """Provide contents of a Hydroshare resource.""" + + def _urlopen(self, req, headers=None): + """A urlopen() helper""" + # someone passed a string, not a request + if not isinstance(req, Request): + req = Request(req) + + #req.add_header("User-Agent", "repo2docker {}".format(__version__)) + if headers is not None: + for key, value in headers.items(): + req.add_header(key, value) + + return urlopen(req) + + def _doi2url(self, doi): + # Transform a DOI to a URL + # If not a doi, assume we have a URL and return + if is_doi(doi): + doi = normalize_doi(doi) + + try: + resp = self._urlopen("https://doi.org/{}".format(doi)) + # If the DOI doesn't resolve, just return URL + except HTTPError: + return doi + return resp.url + else: + # Just return what is actulally just a URL + return doi + + def detect(self, doi, ref=None, extra_args=None): + """Trigger this provider for things that resolve to a Zenodo/Invenio record""" + # We need the hostname (url where records are), api url (for metadata), + # filepath (path to files in metadata), filename (path to filename in + # metadata), download (path to file download URL), and type (path to item type in metadata) + hosts = [ + { + "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + ] + + url = self._doi2url(doi) + + for host in hosts: + if any([url.startswith(s) for s in host["hostname"]]): + self.resource_id = url.rsplit("/", maxsplit=1)[1] + return {"resource": self.resource_id, "host": host} + + def fetch(self, spec, output_dir, yield_output=False): + """Fetch and unpack a Hydroshare resource""" + resource_id = spec["resource"] + host = spec["host"] + + yield "Fetching HydroShare Resource {}.\n".format(resource_id) + + bag_url = "{}{}".format(host["django_irods"], resource_id) + filehandle, _ = urllib.urlretrieve(bag_url) + zip_file_object = zipfile.ZipFile(filehandle, 'r') + zip_file_object.extractall(output_dir) + + @property + def content_id(self): + """The HydroShare resource ID""" + return self.resource_id From e04bc18248cdc6bf72d063d712e3bb52a50255ee Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 16:33:25 -0600 Subject: [PATCH 02/24] remove unused imports --- repo2docker/contentproviders/hydroshare.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index c20cff71..00d6c069 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -1,19 +1,11 @@ -import os -import json -import shutil import urllib import zipfile -from os import makedirs -from os import path from urllib.request import urlopen, Request from urllib.error import HTTPError -from zipfile import ZipFile, is_zipfile from .base import ContentProvider -from ..utils import copytree, deep_get from ..utils import normalize_doi, is_doi -from .. import __version__ class Hydroshare(ContentProvider): From fc9326cbf3bbce7d7a795cf5788a80892f06989e Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 16:39:42 -0600 Subject: [PATCH 03/24] add hydroshare as a content provider --- repo2docker/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 repo2docker/app.py diff --git a/repo2docker/app.py b/repo2docker/app.py old mode 100644 new mode 100755 index e3fc79ee..ff8e151a --- a/repo2docker/app.py +++ b/repo2docker/app.py @@ -142,7 +142,7 @@ class Repo2Docker(Application): # detecting if something will successfully `git clone` is very hard if all # you can do is look at the path/URL to it. content_providers = List( - [contentproviders.Local, contentproviders.Zenodo, contentproviders.Git], + [contentproviders.Local, contentproviders.Zenodo, contentproviders.Hydroshare, contentproviders.Git], config=True, help=""" Ordered list by priority of ContentProviders to try in turn to fetch From 316b1a0f3437bc60912c1dd19fd35091b881f853 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 16:41:31 -0600 Subject: [PATCH 04/24] use request.urlretrieve --- repo2docker/contentproviders/hydroshare.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 00d6c069..5c7bd6e0 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -1,7 +1,6 @@ -import urllib import zipfile -from urllib.request import urlopen, Request +from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError from .base import ContentProvider @@ -67,7 +66,7 @@ class Hydroshare(ContentProvider): yield "Fetching HydroShare Resource {}.\n".format(resource_id) bag_url = "{}{}".format(host["django_irods"], resource_id) - filehandle, _ = urllib.urlretrieve(bag_url) + filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') zip_file_object.extractall(output_dir) From fdb92525c6881747ce5d301c7a8b790622a0843a Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 17:00:55 -0600 Subject: [PATCH 05/24] only use data/contents directory --- repo2docker/contentproviders/hydroshare.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 5c7bd6e0..35284087 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -1,4 +1,6 @@ import zipfile +import os +import shutil from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError @@ -55,7 +57,7 @@ class Hydroshare(ContentProvider): for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): - self.resource_id = url.rsplit("/", maxsplit=1)[1] + self.resource_id = url.rsplit("/", maxsplit=2)[1] return {"resource": self.resource_id, "host": host} def fetch(self, spec, output_dir, yield_output=False): @@ -68,7 +70,12 @@ class Hydroshare(ContentProvider): bag_url = "{}{}".format(host["django_irods"], resource_id) filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') - zip_file_object.extractall(output_dir) + + zip_file_object.extractall("temp") + files = os.listdir(os.path.join("temp", self.resource_id, "data", "contents")) + for f in files: + shutil.move("temp" + f, output_dir) + shutil.rmtree("temp") @property def content_id(self): From 9927faaafd5126ec502d030243dea321f3eb4625 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 17:10:55 -0600 Subject: [PATCH 06/24] extract from contents directory --- repo2docker/contentproviders/hydroshare.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 35284087..f32e5086 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -72,9 +72,10 @@ class Hydroshare(ContentProvider): zip_file_object = zipfile.ZipFile(filehandle, 'r') zip_file_object.extractall("temp") - files = os.listdir(os.path.join("temp", self.resource_id, "data", "contents")) + contents_dir = os.path.join("temp", self.resource_id, "data", "contents") + files = os.listdir(contents_dir) for f in files: - shutil.move("temp" + f, output_dir) + shutil.move(os.path.join(contents_dir, f), output_dir) shutil.rmtree("temp") @property From 8817fec7ac10273c39bd9e0ead076110eaccdddd Mon Sep 17 00:00:00 2001 From: Scott Black Date: Thu, 22 Aug 2019 17:24:52 -0600 Subject: [PATCH 07/24] handle traliing slash in url --- repo2docker/contentproviders/hydroshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index f32e5086..5c1c5e31 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -57,7 +57,7 @@ class Hydroshare(ContentProvider): for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): - self.resource_id = url.rsplit("/", maxsplit=2)[1] + self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] return {"resource": self.resource_id, "host": host} def fetch(self, spec, output_dir, yield_output=False): From 97eee3b8c0d542b4016aa359856e44f6019bba0a Mon Sep 17 00:00:00 2001 From: Scott Black Date: Fri, 23 Aug 2019 10:56:52 -0600 Subject: [PATCH 08/24] allow for bag creation --- repo2docker/contentproviders/hydroshare.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 5c1c5e31..c26fadd1 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -1,6 +1,7 @@ import zipfile import os import shutil +import time from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError @@ -68,10 +69,21 @@ class Hydroshare(ContentProvider): yield "Fetching HydroShare Resource {}.\n".format(resource_id) bag_url = "{}{}".format(host["django_irods"], resource_id) + + # bag downloads are prepared on demand and may need some time + conn = urlopen(bag_url) + while str(conn.info().get_content_type()) != "application/zip": + if conn.getcode() != 200: + yield "Failed to download bag. status code {}".format(conn.getcode()) + return + yield "Bag is being prepared, requesting again in 3 seconds" + time.sleep(3) + filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') zip_file_object.extractall("temp") + # resources store the contents in the data/contents directory, which is all we want to keep contents_dir = os.path.join("temp", self.resource_id, "data", "contents") files = os.listdir(contents_dir) for f in files: From 970c7ece201441348c249c25dfc0fe5c1d32443a Mon Sep 17 00:00:00 2001 From: Scott Black Date: Fri, 23 Aug 2019 11:06:02 -0600 Subject: [PATCH 09/24] actually reconnect --- repo2docker/contentproviders/hydroshare.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index c26fadd1..bd4a69e5 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -72,12 +72,13 @@ class Hydroshare(ContentProvider): # bag downloads are prepared on demand and may need some time conn = urlopen(bag_url) - while str(conn.info().get_content_type()) != "application/zip": + while conn.info().get_content_type() != "application/zip": if conn.getcode() != 200: yield "Failed to download bag. status code {}".format(conn.getcode()) return yield "Bag is being prepared, requesting again in 3 seconds" time.sleep(3) + conn = urlopen(bag_url) filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') From b068f8d9c1cdef904de8455162e83e4bfd17d706 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Fri, 23 Aug 2019 11:09:26 -0600 Subject: [PATCH 10/24] extend wait time and cleanup messaging --- repo2docker/contentproviders/hydroshare.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index bd4a69e5..112ef2a7 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -74,10 +74,11 @@ class Hydroshare(ContentProvider): conn = urlopen(bag_url) while conn.info().get_content_type() != "application/zip": if conn.getcode() != 200: - yield "Failed to download bag. status code {}".format(conn.getcode()) + yield "Failed to download bag. status code {}.\n".format(conn.getcode()) return - yield "Bag is being prepared, requesting again in 3 seconds" - time.sleep(3) + wait_time = 10 + yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time) + time.sleep(wait_time) conn = urlopen(bag_url) filehandle, _ = urlretrieve(bag_url) From ebbc9e0233dcf70092a3ccfba2b1c7017a1fae4c Mon Sep 17 00:00:00 2001 From: Scott Black Date: Fri, 23 Aug 2019 11:16:39 -0600 Subject: [PATCH 11/24] update messaging --- repo2docker/contentproviders/hydroshare.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 112ef2a7..15af9b01 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -81,15 +81,18 @@ class Hydroshare(ContentProvider): time.sleep(wait_time) conn = urlopen(bag_url) + # Bag creation seems to need a small time buffer after it says it's ready. + time.sleep(1) filehandle, _ = urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') - + yield "Downloaded, unpacking contents.\n" zip_file_object.extractall("temp") # resources store the contents in the data/contents directory, which is all we want to keep contents_dir = os.path.join("temp", self.resource_id, "data", "contents") files = os.listdir(contents_dir) for f in files: shutil.move(os.path.join(contents_dir, f), output_dir) + yield "Finished, cleaning up.\n" shutil.rmtree("temp") @property From d89f3a66aacecbc55bbc0a13bb1b47f9dfcaca78 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Wed, 25 Sep 2019 09:59:44 -0600 Subject: [PATCH 12/24] update hydroshare content provider to doi and add tests --- repo2docker/contentproviders/hydroshare.py | 60 +++---- .../unit/contentproviders/test_hydroshare.py | 156 ++++++++++++++++++ 2 files changed, 175 insertions(+), 41 deletions(-) create mode 100644 tests/unit/contentproviders/test_hydroshare.py diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 15af9b01..60e7d5b6 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -6,42 +6,13 @@ import time from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError -from .base import ContentProvider +from .doi import DoiProvider from ..utils import normalize_doi, is_doi -class Hydroshare(ContentProvider): +class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" - def _urlopen(self, req, headers=None): - """A urlopen() helper""" - # someone passed a string, not a request - if not isinstance(req, Request): - req = Request(req) - - #req.add_header("User-Agent", "repo2docker {}".format(__version__)) - if headers is not None: - for key, value in headers.items(): - req.add_header(key, value) - - return urlopen(req) - - def _doi2url(self, doi): - # Transform a DOI to a URL - # If not a doi, assume we have a URL and return - if is_doi(doi): - doi = normalize_doi(doi) - - try: - resp = self._urlopen("https://doi.org/{}".format(doi)) - # If the DOI doesn't resolve, just return URL - except HTTPError: - return doi - return resp.url - else: - # Just return what is actulally just a URL - return doi - def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" # We need the hostname (url where records are), api url (for metadata), @@ -54,14 +25,17 @@ class Hydroshare(ContentProvider): }, ] - url = self._doi2url(doi) + url = self.doi2url(doi) for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] return {"resource": self.resource_id, "host": host} - def fetch(self, spec, output_dir, yield_output=False): + def _urlretrieve(bag_url): + return urlretrieve(bag_url) + + def fetch(self, spec, output_dir, yield_output=False, timeout=120): """Fetch and unpack a Hydroshare resource""" resource_id = spec["resource"] host = spec["host"] @@ -71,19 +45,23 @@ class Hydroshare(ContentProvider): bag_url = "{}{}".format(host["django_irods"], resource_id) # bag downloads are prepared on demand and may need some time - conn = urlopen(bag_url) - while conn.info().get_content_type() != "application/zip": - if conn.getcode() != 200: - yield "Failed to download bag. status code {}.\n".format(conn.getcode()) - return + conn = self.urlopen(bag_url) + total_wait_time = 0 + while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip": wait_time = 10 + total_wait_time += wait_time + if total_wait_time > timeout: + yield "Bag taking too long to prepare, exiting now, try again later." + return yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time) time.sleep(wait_time) - conn = urlopen(bag_url) - + conn = self.urlopen(bag_url) + if conn.getcode() != 200: + yield "Failed to download bag. status code {}.\n".format(conn.getcode()) + return # Bag creation seems to need a small time buffer after it says it's ready. time.sleep(1) - filehandle, _ = urlretrieve(bag_url) + filehandle, _ = self._urlretrieve(bag_url) zip_file_object = zipfile.ZipFile(filehandle, 'r') yield "Downloaded, unpacking contents.\n" zip_file_object.extractall("temp") diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py new file mode 100644 index 00000000..a5a64d0a --- /dev/null +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -0,0 +1,156 @@ +import json +import os +import pytest + +from contextlib import contextmanager +from io import BytesIO +from tempfile import TemporaryDirectory, NamedTemporaryFile +from unittest.mock import patch +from urllib.request import urlopen, Request, urlretrieve +from zipfile import ZipFile + +from repo2docker.contentproviders import Hydroshare + + +def test_content_id(): + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + hydro = Hydroshare() + + hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") + assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61" + + +test_hosts = [ + ( + [ + "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", + "10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", + ], + { + "host": { + "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "b8f6eae9d89241cf8b5904033460af61", + }, + ), +] + +class MockInfo: + def __init__(self, content_type): + self.content_type = content_type + + def get_content_type(self): + return self.content_type + +class MockResponse: + def __init__(self, content_type, status_code): + self.content_type = content_type + self.status_code = status_code + self.mock_info = MockInfo(self.content_type) + + def getcode(self): + return self.status_code + + def info(self): + return self.mock_info + +@pytest.mark.parametrize("test_input,expected", test_hosts) +def test_detect_hydroshare(test_input, expected): + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = test_input[0] + # valid Hydroshare DOIs trigger this content provider + assert Hydroshare().detect(test_input[0]) == expected + assert Hydroshare().detect(test_input[1]) == expected + assert Hydroshare().detect(test_input[2]) == expected + # only two of the three calls above have to resolve a DOI + assert fake_urlopen.call_count == 2 + + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + # Don't trigger the Hydroshare content provider + assert Hydroshare().detect("/some/path/here") is None + assert Hydroshare().detect("https://example.com/path/here") is None + # don't handle DOIs that aren't from Hydroshare + fake_urlopen.return_value.url = ( + "http://joss.theoj.org/papers/10.21105/joss.01277" + ) + assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None + +@contextmanager +def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): + with NamedTemporaryFile(suffix=".zip") as zfile: + with ZipFile(zfile.name, mode="w") as zip: + zip.writestr("{}/some-file.txt".format(prefix), "some content") + zip.writestr("{}/some-other-file.txt".format(prefix), "some more content") + + yield zfile + +def test_fetch_bag(): + # we "fetch" a local ZIP file to simulate a Hydroshare resource + with hydroshare_archive() as hydro_path: + with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200), MockResponse("application/zip", 200)]): + with patch.object(Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]): + hydro = Hydroshare() + hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61" + spec = { + "host": { + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "123456789", + } + + with TemporaryDirectory() as d: + output = [] + for l in hydro.fetch(spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + expected = set(["some-other-file.txt", "some-file.txt"]) + assert expected == unpacked_files + +def test_fetch_bag_failure(): + with hydroshare_archive() as hydro_path: + with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]): + hydro = Hydroshare() + spec = { + "host": { + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "123456789", + } + with TemporaryDirectory() as d: + output = [] + for l in hydro.fetch(spec, d): + output.append(l) + assert "Failed to download bag. status code 500.\n" == output[-1] + +def test_fetch_bag_timeout(): + with hydroshare_archive() as hydro_path: + with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]): + hydro = Hydroshare() + spec = { + "host": { + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], + "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + }, + "resource": "123456789", + } + with TemporaryDirectory() as d: + output = [] + for l in hydro.fetch(spec, d, timeout=0): + output.append(l) + assert "Bag taking too long to prepare, exiting now, try again later." == output[-1] + From 2508cc21047cc6019d08393d22cc607084853dc4 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Wed, 25 Sep 2019 10:58:45 -0600 Subject: [PATCH 13/24] fix _urlretrieve class method --- repo2docker/contentproviders/hydroshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 60e7d5b6..79045bb7 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -32,7 +32,7 @@ class Hydroshare(DoiProvider): self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] return {"resource": self.resource_id, "host": host} - def _urlretrieve(bag_url): + def _urlretrieve(self, bag_url): return urlretrieve(bag_url) def fetch(self, spec, output_dir, yield_output=False, timeout=120): From 7fcb67f06d433730d5534cbcaa42c5faaa2f79fe Mon Sep 17 00:00:00 2001 From: Scott Black Date: Wed, 25 Sep 2019 14:00:47 -0600 Subject: [PATCH 14/24] raise exceptions during failure and add modify date version --- repo2docker/contentproviders/hydroshare.py | 25 ++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 79045bb7..6aec6552 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -2,11 +2,13 @@ import zipfile import os import shutil import time +import json from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError from .doi import DoiProvider +from .base import ContentProviderException from ..utils import normalize_doi, is_doi @@ -22,15 +24,24 @@ class Hydroshare(DoiProvider): { "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements" }, ] + def fetch_version(resource_id, host): + """Fetch resource modified date and convert to epoch int""" + json_response = json.loads(self.urlopen(host["version"].format(self.resource_id)).read()) + date = next(item for item in json_response["dates"] if item["type"] == "modified")["start_date"] + date = date.split(".")[0] + return int(time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S"))) + url = self.doi2url(doi) for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] - return {"resource": self.resource_id, "host": host} + self.version = fetch_version(self.resource_id, host) + return {"resource": self.resource_id, "host": host, "version": self.version} def _urlretrieve(self, bag_url): return urlretrieve(bag_url) @@ -51,14 +62,16 @@ class Hydroshare(DoiProvider): wait_time = 10 total_wait_time += wait_time if total_wait_time > timeout: - yield "Bag taking too long to prepare, exiting now, try again later." - return + msg = "Bag taking too long to prepare, exiting now, try again later." + yield msg + raise ContentProviderException(msg) yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time) time.sleep(wait_time) conn = self.urlopen(bag_url) if conn.getcode() != 200: - yield "Failed to download bag. status code {}.\n".format(conn.getcode()) - return + msg = "Failed to download bag. status code {}.\n".format(conn.getcode()) + yield msg + raise ContentProviderException(msg) # Bag creation seems to need a small time buffer after it says it's ready. time.sleep(1) filehandle, _ = self._urlretrieve(bag_url) @@ -76,4 +89,4 @@ class Hydroshare(DoiProvider): @property def content_id(self): """The HydroShare resource ID""" - return self.resource_id + return "{}.v{}".format(self.resource_id, self.version) From 21e61f30997f41c426c1937b5c48077351d41df6 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Wed, 25 Sep 2019 14:37:19 -0600 Subject: [PATCH 15/24] add hydroshare resource versioning and raise exception fetch fails --- repo2docker/contentproviders/hydroshare.py | 4 +- .../unit/contentproviders/test_hydroshare.py | 78 ++++++++++++------- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 6aec6552..17e6116b 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -29,11 +29,11 @@ class Hydroshare(DoiProvider): ] def fetch_version(resource_id, host): - """Fetch resource modified date and convert to epoch int""" + """Fetch resource modified date and convert to epoch""" json_response = json.loads(self.urlopen(host["version"].format(self.resource_id)).read()) date = next(item for item in json_response["dates"] if item["type"] == "modified")["start_date"] date = date.split(".")[0] - return int(time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S"))) + return str(int(time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S")))) url = self.doi2url(doi) diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py index a5a64d0a..e33a7d38 100644 --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -10,16 +10,19 @@ from urllib.request import urlopen, Request, urlretrieve from zipfile import ZipFile from repo2docker.contentproviders import Hydroshare +from repo2docker.contentproviders.base import ContentProviderException def test_content_id(): with patch.object(Hydroshare, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + def read(): + return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + fake_urlopen.return_value.read = read hydro = Hydroshare() hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") - assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61" - + assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569449357" test_hosts = [ ( @@ -32,41 +35,27 @@ test_hosts = [ "host": { "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", + "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", }, "resource": "b8f6eae9d89241cf8b5904033460af61", + "version": "1569449357" }, ), ] -class MockInfo: - def __init__(self, content_type): - self.content_type = content_type - - def get_content_type(self): - return self.content_type - -class MockResponse: - def __init__(self, content_type, status_code): - self.content_type = content_type - self.status_code = status_code - self.mock_info = MockInfo(self.content_type) - - def getcode(self): - return self.status_code - - def info(self): - return self.mock_info - @pytest.mark.parametrize("test_input,expected", test_hosts) def test_detect_hydroshare(test_input, expected): with patch.object(Hydroshare, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = test_input[0] + def read(): + return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + fake_urlopen.return_value.read = read # valid Hydroshare DOIs trigger this content provider assert Hydroshare().detect(test_input[0]) == expected assert Hydroshare().detect(test_input[1]) == expected assert Hydroshare().detect(test_input[2]) == expected # only two of the three calls above have to resolve a DOI - assert fake_urlopen.call_count == 2 + assert fake_urlopen.call_count == 5 with patch.object(Hydroshare, "urlopen") as fake_urlopen: # Don't trigger the Hydroshare content provider @@ -76,6 +65,9 @@ def test_detect_hydroshare(test_input, expected): fake_urlopen.return_value.url = ( "http://joss.theoj.org/papers/10.21105/joss.01277" ) + def read(): + return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + fake_urlopen.return_value.read = read assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None @contextmanager @@ -87,6 +79,28 @@ def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): yield zfile +class MockInfo: + def __init__(self, content_type): + self.content_type = content_type + + def get_content_type(self): + return self.content_type + +class MockResponse: + def __init__(self, content_type, status_code): + self.content_type = content_type + self.status_code = status_code + self.mock_info = MockInfo(self.content_type) + + def getcode(self): + return self.status_code + + def info(self): + return self.mock_info + + def read(): + return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + def test_fetch_bag(): # we "fetch" a local ZIP file to simulate a Hydroshare resource with hydroshare_archive() as hydro_path: @@ -130,9 +144,13 @@ def test_fetch_bag_failure(): } with TemporaryDirectory() as d: output = [] - for l in hydro.fetch(spec, d): - output.append(l) - assert "Failed to download bag. status code 500.\n" == output[-1] + try: + for l in hydro.fetch(spec, d): + output.append(l) + print("ContentProviderException should have been thrown") + assert False + except ContentProviderException: + assert "Failed to download bag. status code 500.\n" == output[-1] def test_fetch_bag_timeout(): with hydroshare_archive() as hydro_path: @@ -150,7 +168,11 @@ def test_fetch_bag_timeout(): } with TemporaryDirectory() as d: output = [] - for l in hydro.fetch(spec, d, timeout=0): - output.append(l) - assert "Bag taking too long to prepare, exiting now, try again later." == output[-1] + try: + for l in hydro.fetch(spec, d, timeout=0): + output.append(l) + print("ContentProviderException should have been thrown") + assert False + except ContentProviderException: + assert "Bag taking too long to prepare, exiting now, try again later." == output[-1] From dddc45acfb4789ac9f4a52963f35b7102e74811e Mon Sep 17 00:00:00 2001 From: Scott Black Date: Mon, 30 Sep 2019 13:50:24 -0600 Subject: [PATCH 16/24] reformatting with lint --- repo2docker/contentproviders/hydroshare.py | 39 ++++++++----- .../unit/contentproviders/test_hydroshare.py | 58 +++++++++++++++---- 2 files changed, 72 insertions(+), 25 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 17e6116b..7072a34e 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -16,22 +16,26 @@ class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" def detect(self, doi, ref=None, extra_args=None): - """Trigger this provider for things that resolve to a Zenodo/Invenio record""" - # We need the hostname (url where records are), api url (for metadata), - # filepath (path to files in metadata), filename (path to filename in - # metadata), download (path to file download URL), and type (path to item type in metadata) + """Trigger this provider for things that resolve to a Hydroshare resource""" hosts = [ { - "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", - "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements" - }, + "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", + } ] def fetch_version(resource_id, host): """Fetch resource modified date and convert to epoch""" - json_response = json.loads(self.urlopen(host["version"].format(self.resource_id)).read()) - date = next(item for item in json_response["dates"] if item["type"] == "modified")["start_date"] + json_response = json.loads( + self.urlopen(host["version"].format(self.resource_id)).read() + ) + date = next( + item for item in json_response["dates"] if item["type"] == "modified" + )["start_date"] date = date.split(".")[0] return str(int(time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S")))) @@ -41,7 +45,11 @@ class Hydroshare(DoiProvider): if any([url.startswith(s) for s in host["hostname"]]): self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] self.version = fetch_version(self.resource_id, host) - return {"resource": self.resource_id, "host": host, "version": self.version} + return { + "resource": self.resource_id, + "host": host, + "version": self.version, + } def _urlretrieve(self, bag_url): return urlretrieve(bag_url) @@ -58,14 +66,19 @@ class Hydroshare(DoiProvider): # bag downloads are prepared on demand and may need some time conn = self.urlopen(bag_url) total_wait_time = 0 - while conn.getcode() == 200 and conn.info().get_content_type() != "application/zip": + while ( + conn.getcode() == 200 + and conn.info().get_content_type() != "application/zip" + ): wait_time = 10 total_wait_time += wait_time if total_wait_time > timeout: msg = "Bag taking too long to prepare, exiting now, try again later." yield msg raise ContentProviderException(msg) - yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time) + yield "Bag is being prepared, requesting again in {} seconds.\n".format( + wait_time + ) time.sleep(wait_time) conn = self.urlopen(bag_url) if conn.getcode() != 200: @@ -75,7 +88,7 @@ class Hydroshare(DoiProvider): # Bag creation seems to need a small time buffer after it says it's ready. time.sleep(1) filehandle, _ = self._urlretrieve(bag_url) - zip_file_object = zipfile.ZipFile(filehandle, 'r') + zip_file_object = zipfile.ZipFile(filehandle, "r") yield "Downloaded, unpacking contents.\n" zip_file_object.extractall("temp") # resources store the contents in the data/contents directory, which is all we want to keep diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py index e33a7d38..53604134 100644 --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -15,15 +15,20 @@ from repo2docker.contentproviders.base import ContentProviderException def test_content_id(): with patch.object(Hydroshare, "urlopen") as fake_urlopen: - fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + fake_urlopen.return_value.url = ( + "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + ) + def read(): return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + fake_urlopen.return_value.read = read hydro = Hydroshare() hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569449357" + test_hosts = [ ( [ @@ -33,22 +38,28 @@ test_hosts = [ ], { "host": { - "hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"], + "hostname": [ + "https://www.hydroshare.org/resource/", + "http://www.hydroshare.org/resource/", + ], "django_irods": "https://www.hydroshare.org/django_irods/download/bags/", "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", }, "resource": "b8f6eae9d89241cf8b5904033460af61", - "version": "1569449357" + "version": "1569449357", }, - ), + ) ] + @pytest.mark.parametrize("test_input,expected", test_hosts) def test_detect_hydroshare(test_input, expected): with patch.object(Hydroshare, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = test_input[0] + def read(): return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + fake_urlopen.return_value.read = read # valid Hydroshare DOIs trigger this content provider assert Hydroshare().detect(test_input[0]) == expected @@ -65,11 +76,14 @@ def test_detect_hydroshare(test_input, expected): fake_urlopen.return_value.url = ( "http://joss.theoj.org/papers/10.21105/joss.01277" ) + def read(): return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + fake_urlopen.return_value.read = read assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None + @contextmanager def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): with NamedTemporaryFile(suffix=".zip") as zfile: @@ -79,6 +93,7 @@ def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"): yield zfile + class MockInfo: def __init__(self, content_type): self.content_type = content_type @@ -86,6 +101,7 @@ class MockInfo: def get_content_type(self): return self.content_type + class MockResponse: def __init__(self, content_type, status_code): self.content_type = content_type @@ -101,11 +117,21 @@ class MockResponse: def read(): return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + def test_fetch_bag(): # we "fetch" a local ZIP file to simulate a Hydroshare resource with hydroshare_archive() as hydro_path: - with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200), MockResponse("application/zip", 200)]): - with patch.object(Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]): + with patch.object( + Hydroshare, + "urlopen", + side_effect=[ + MockResponse("application/html", 200), + MockResponse("application/zip", 200), + ], + ): + with patch.object( + Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)] + ): hydro = Hydroshare() hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61" spec = { @@ -118,19 +144,22 @@ def test_fetch_bag(): }, "resource": "123456789", } - + with TemporaryDirectory() as d: output = [] for l in hydro.fetch(spec, d): output.append(l) - + unpacked_files = set(os.listdir(d)) expected = set(["some-other-file.txt", "some-file.txt"]) assert expected == unpacked_files + def test_fetch_bag_failure(): with hydroshare_archive() as hydro_path: - with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]): + with patch.object( + Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)] + ): hydro = Hydroshare() spec = { "host": { @@ -152,9 +181,12 @@ def test_fetch_bag_failure(): except ContentProviderException: assert "Failed to download bag. status code 500.\n" == output[-1] + def test_fetch_bag_timeout(): with hydroshare_archive() as hydro_path: - with patch.object(Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]): + with patch.object( + Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)] + ): hydro = Hydroshare() spec = { "host": { @@ -174,5 +206,7 @@ def test_fetch_bag_timeout(): print("ContentProviderException should have been thrown") assert False except ContentProviderException: - assert "Bag taking too long to prepare, exiting now, try again later." == output[-1] - + assert ( + "Bag taking too long to prepare, exiting now, try again later." + == output[-1] + ) From 8dd9b5b928bd9adb9425ef70996854ff46f7cced Mon Sep 17 00:00:00 2001 From: Scott Black Date: Mon, 30 Sep 2019 15:57:18 -0600 Subject: [PATCH 17/24] update date parser to parse hydroshare date with timezone --- repo2docker/contentproviders/hydroshare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index 7072a34e..cdc3a291 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -3,6 +3,7 @@ import os import shutil import time import json +import dateutil.parser from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError @@ -36,8 +37,7 @@ class Hydroshare(DoiProvider): date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] - date = date.split(".")[0] - return str(int(time.mktime(time.strptime(date, "%Y-%m-%dT%H:%M:%S")))) + return str(int(time.mktime(dateutil.parser.parse(date)))) url = self.doi2url(doi) From 3a936412afb6cfc278e88647c415ad98561321e4 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Mon, 30 Sep 2019 16:59:01 -0600 Subject: [PATCH 18/24] use datetime for iso parsing --- repo2docker/contentproviders/hydroshare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index cdc3a291..ff0f22a8 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -3,7 +3,7 @@ import os import shutil import time import json -import dateutil.parser +import datetime from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError @@ -37,7 +37,7 @@ class Hydroshare(DoiProvider): date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] - return str(int(time.mktime(dateutil.parser.parse(date)))) + return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").timestamp() url = self.doi2url(doi) From d657453b3ff05c8657f9d92d7c42362c5fd52791 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Tue, 1 Oct 2019 11:16:34 -0600 Subject: [PATCH 19/24] update hydroshare version fetch to assume timestamp is in utc --- repo2docker/contentproviders/hydroshare.py | 9 +++++++-- tests/unit/contentproviders/test_hydroshare.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index ff0f22a8..f0a8b0d1 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -3,7 +3,7 @@ import os import shutil import time import json -import datetime +from datetime import datetime, timezone, timedelta from urllib.request import urlopen, Request, urlretrieve from urllib.error import HTTPError @@ -37,7 +37,12 @@ class Hydroshare(DoiProvider): date = next( item for item in json_response["dates"] if item["type"] == "modified" )["start_date"] - return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").timestamp() + # Hydroshare timestamp always returns the same timezone, so strip it + date = date.split(".")[0] + parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") + epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() + # truncate the timestamp + return str(int(epoch)) url = self.doi2url(doi) diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py index 53604134..ee96b587 100644 --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -26,7 +26,7 @@ def test_content_id(): hydro = Hydroshare() hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") - assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569449357" + assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757" test_hosts = [ @@ -46,7 +46,7 @@ test_hosts = [ "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", }, "resource": "b8f6eae9d89241cf8b5904033460af61", - "version": "1569449357", + "version": "1569427757", }, ) ] From 16411fc35e3fa05b301f67e4b455719308d14aac Mon Sep 17 00:00:00 2001 From: Scott Black Date: Tue, 1 Oct 2019 12:59:06 -0600 Subject: [PATCH 20/24] update formatting --- repo2docker/contentproviders/hydroshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index f0a8b0d1..b300ce50 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -39,7 +39,7 @@ class Hydroshare(DoiProvider): )["start_date"] # Hydroshare timestamp always returns the same timezone, so strip it date = date.split(".")[0] - parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") + parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() # truncate the timestamp return str(int(epoch)) From 1257a2c9106dfaa95be2287b70b41ee816cf7098 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Mon, 14 Oct 2019 13:56:29 -0600 Subject: [PATCH 21/24] update downloading message to include full url --- repo2docker/contentproviders/hydroshare.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index b300ce50..cc115cfe 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -64,10 +64,10 @@ class Hydroshare(DoiProvider): resource_id = spec["resource"] host = spec["host"] - yield "Fetching HydroShare Resource {}.\n".format(resource_id) - bag_url = "{}{}".format(host["django_irods"], resource_id) + yield "Downloading {}.\n".format(bag_url) + # bag downloads are prepared on demand and may need some time conn = self.urlopen(bag_url) total_wait_time = 0 From 776d538689ca28f5196d3f306d6e7bb4c5fbb376 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Fri, 6 Dec 2019 16:22:18 -0700 Subject: [PATCH 22/24] update tests and code to follow best practices pointed out in review --- repo2docker/contentproviders/hydroshare.py | 37 ++++++++--------- .../unit/contentproviders/test_hydroshare.py | 40 +++++++------------ 2 files changed, 31 insertions(+), 46 deletions(-) mode change 100644 => 100755 tests/unit/contentproviders/test_hydroshare.py diff --git a/repo2docker/contentproviders/hydroshare.py b/repo2docker/contentproviders/hydroshare.py index cc115cfe..2531aa8b 100755 --- a/repo2docker/contentproviders/hydroshare.py +++ b/repo2docker/contentproviders/hydroshare.py @@ -5,17 +5,30 @@ import time import json from datetime import datetime, timezone, timedelta -from urllib.request import urlopen, Request, urlretrieve -from urllib.error import HTTPError +from urllib.request import urlretrieve from .doi import DoiProvider from .base import ContentProviderException -from ..utils import normalize_doi, is_doi class Hydroshare(DoiProvider): """Provide contents of a Hydroshare resource.""" + def _fetch_version(self, host): + """Fetch resource modified date and convert to epoch""" + json_response = json.loads( + self.urlopen(host["version"].format(self.resource_id)).read() + ) + date = next( + item for item in json_response["dates"] if item["type"] == "modified" + )["start_date"] + # Hydroshare timestamp always returns the same timezone, so strip it + date = date.split(".")[0] + parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") + epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() + # truncate the timestamp + return str(int(epoch)) + def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Hydroshare resource""" hosts = [ @@ -28,28 +41,12 @@ class Hydroshare(DoiProvider): "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements", } ] - - def fetch_version(resource_id, host): - """Fetch resource modified date and convert to epoch""" - json_response = json.loads( - self.urlopen(host["version"].format(self.resource_id)).read() - ) - date = next( - item for item in json_response["dates"] if item["type"] == "modified" - )["start_date"] - # Hydroshare timestamp always returns the same timezone, so strip it - date = date.split(".")[0] - parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") - epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp() - # truncate the timestamp - return str(int(epoch)) - url = self.doi2url(doi) for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1] - self.version = fetch_version(self.resource_id, host) + self.version = self._fetch_version(host) return { "resource": self.resource_id, "host": host, diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py old mode 100644 new mode 100755 index ee96b587..934775dd --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -3,10 +3,8 @@ import os import pytest from contextlib import contextmanager -from io import BytesIO from tempfile import TemporaryDirectory, NamedTemporaryFile from unittest.mock import patch -from urllib.request import urlopen, Request, urlretrieve from zipfile import ZipFile from repo2docker.contentproviders import Hydroshare @@ -51,9 +49,8 @@ test_hosts = [ ) ] - -@pytest.mark.parametrize("test_input,expected", test_hosts) -def test_detect_hydroshare(test_input, expected): +@pytest.mark.parametrize("test_input", test_hosts) +def test_detect_hydroshare(test_input): with patch.object(Hydroshare, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = test_input[0] @@ -62,10 +59,15 @@ def test_detect_hydroshare(test_input, expected): fake_urlopen.return_value.read = read # valid Hydroshare DOIs trigger this content provider + expected = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" assert Hydroshare().detect(test_input[0]) == expected + # assert a call to urlopen was called to fetch version + assert fake_urlopen.call_count == 1 assert Hydroshare().detect(test_input[1]) == expected + # assert 2 more calls were made, one to resolve the DOI and another to fetch the version + assert fake_urlopen.call_count == 3 assert Hydroshare().detect(test_input[2]) == expected - # only two of the three calls above have to resolve a DOI + # assert 2 more calls were made, one to resolve the DOI and another to fetch the version assert fake_urlopen.call_count == 5 with patch.object(Hydroshare, "urlopen") as fake_urlopen: @@ -114,7 +116,7 @@ class MockResponse: def info(self): return self.mock_info - def read(): + def read(self): return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' @@ -172,14 +174,8 @@ def test_fetch_bag_failure(): "resource": "123456789", } with TemporaryDirectory() as d: - output = [] - try: - for l in hydro.fetch(spec, d): - output.append(l) - print("ContentProviderException should have been thrown") - assert False - except ContentProviderException: - assert "Failed to download bag. status code 500.\n" == output[-1] + with pytest.raises(ContentProviderException, match="Failed to download bag. status code 500.\n"): + hydro.fetch(spec, d) def test_fetch_bag_timeout(): @@ -199,14 +195,6 @@ def test_fetch_bag_timeout(): "resource": "123456789", } with TemporaryDirectory() as d: - output = [] - try: - for l in hydro.fetch(spec, d, timeout=0): - output.append(l) - print("ContentProviderException should have been thrown") - assert False - except ContentProviderException: - assert ( - "Bag taking too long to prepare, exiting now, try again later." - == output[-1] - ) + with pytest.raises(ContentProviderException, + match="Bag taking too long to prepare, exiting now, try again later."): + hydro.fetch(spec, d, timeout=0) From cb7bb0bb43785c5589bb430d260f996673bbdfd5 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Fri, 6 Dec 2019 17:36:50 -0700 Subject: [PATCH 23/24] remove test parameterization and use fix regex match --- tests/unit/contentproviders/test_hydroshare.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py index 934775dd..22fed715 100755 --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -49,10 +49,9 @@ test_hosts = [ ) ] -@pytest.mark.parametrize("test_input", test_hosts) -def test_detect_hydroshare(test_input): +def test_detect_hydroshare(): with patch.object(Hydroshare, "urlopen") as fake_urlopen: - fake_urlopen.return_value.url = test_input[0] + fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" def read(): return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' @@ -60,13 +59,13 @@ def test_detect_hydroshare(test_input): fake_urlopen.return_value.read = read # valid Hydroshare DOIs trigger this content provider expected = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - assert Hydroshare().detect(test_input[0]) == expected + assert Hydroshare().detect("https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61") == expected # assert a call to urlopen was called to fetch version assert fake_urlopen.call_count == 1 - assert Hydroshare().detect(test_input[1]) == expected + assert Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected # assert 2 more calls were made, one to resolve the DOI and another to fetch the version assert fake_urlopen.call_count == 3 - assert Hydroshare().detect(test_input[2]) == expected + assert Hydroshare().detect("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected # assert 2 more calls were made, one to resolve the DOI and another to fetch the version assert fake_urlopen.call_count == 5 @@ -174,7 +173,7 @@ def test_fetch_bag_failure(): "resource": "123456789", } with TemporaryDirectory() as d: - with pytest.raises(ContentProviderException, match="Failed to download bag. status code 500.\n"): + with pytest.raises(ContentProviderException, match=r"Failed to download bag. status code 500.\n"): hydro.fetch(spec, d) @@ -196,5 +195,5 @@ def test_fetch_bag_timeout(): } with TemporaryDirectory() as d: with pytest.raises(ContentProviderException, - match="Bag taking too long to prepare, exiting now, try again later."): + match=r"Bag taking too long to prepare, exiting now, try again later."): hydro.fetch(spec, d, timeout=0) From 0c15cb125169ba92e8b2935330958717ad393cf6 Mon Sep 17 00:00:00 2001 From: Scott Black Date: Sat, 7 Dec 2019 13:34:49 -0700 Subject: [PATCH 24/24] cleanup tests and formatting --- .../unit/contentproviders/test_hydroshare.py | 81 ++++++++++--------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/tests/unit/contentproviders/test_hydroshare.py b/tests/unit/contentproviders/test_hydroshare.py index 22fed715..d662dbff 100755 --- a/tests/unit/contentproviders/test_hydroshare.py +++ b/tests/unit/contentproviders/test_hydroshare.py @@ -1,4 +1,3 @@ -import json import os import pytest @@ -27,14 +26,18 @@ def test_content_id(): assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757" -test_hosts = [ - ( - [ - "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", - "10.4211/hs.b8f6eae9d89241cf8b5904033460af61", - "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", - ], - { +def test_detect_hydroshare(): + with patch.object(Hydroshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = ( + "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + ) + + def read(): + return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' + + fake_urlopen.return_value.read = read + # valid Hydroshare DOIs trigger this content provider + expected = { "host": { "hostname": [ "https://www.hydroshare.org/resource/", @@ -45,27 +48,27 @@ test_hosts = [ }, "resource": "b8f6eae9d89241cf8b5904033460af61", "version": "1569427757", - }, - ) -] - -def test_detect_hydroshare(): - with patch.object(Hydroshare, "urlopen") as fake_urlopen: - fake_urlopen.return_value.url = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - - def read(): - return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' - - fake_urlopen.return_value.read = read - # valid Hydroshare DOIs trigger this content provider - expected = "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" - assert Hydroshare().detect("https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61") == expected + } + assert ( + Hydroshare().detect( + "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61" + ) + == expected + ) # assert a call to urlopen was called to fetch version assert fake_urlopen.call_count == 1 - assert Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected + assert ( + Hydroshare().detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61") + == expected + ) # assert 2 more calls were made, one to resolve the DOI and another to fetch the version assert fake_urlopen.call_count == 3 - assert Hydroshare().detect("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61") == expected + assert ( + Hydroshare().detect( + "https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61" + ) + == expected + ) # assert 2 more calls were made, one to resolve the DOI and another to fetch the version assert fake_urlopen.call_count == 5 @@ -115,9 +118,6 @@ class MockResponse: def info(self): return self.mock_info - def read(self): - return '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}' - def test_fetch_bag(): # we "fetch" a local ZIP file to simulate a Hydroshare resource @@ -157,7 +157,7 @@ def test_fetch_bag(): def test_fetch_bag_failure(): - with hydroshare_archive() as hydro_path: + with hydroshare_archive(): with patch.object( Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)] ): @@ -173,12 +173,17 @@ def test_fetch_bag_failure(): "resource": "123456789", } with TemporaryDirectory() as d: - with pytest.raises(ContentProviderException, match=r"Failed to download bag. status code 500.\n"): - hydro.fetch(spec, d) + with pytest.raises( + ContentProviderException, + match=r"Failed to download bag\. status code 500\.", + ): + # loop for yield statements + for l in hydro.fetch(spec, d): + pass def test_fetch_bag_timeout(): - with hydroshare_archive() as hydro_path: + with hydroshare_archive(): with patch.object( Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)] ): @@ -194,6 +199,10 @@ def test_fetch_bag_timeout(): "resource": "123456789", } with TemporaryDirectory() as d: - with pytest.raises(ContentProviderException, - match=r"Bag taking too long to prepare, exiting now, try again later."): - hydro.fetch(spec, d, timeout=0) + with pytest.raises( + ContentProviderException, + match=r"Bag taking too long to prepare, exiting now, try again later\.", + ): + # loop for yield statements + for l in hydro.fetch(spec, d, timeout=0): + pass