kopia lustrzana https://github.com/jupyterhub/repo2docker
[MRG] add Hydroshare as a content provider (#800)
[MRG] add Hydroshare as a content provider (pull/842/head)
commit
900d280561
|
@ -147,6 +147,7 @@ class Repo2Docker(Application):
|
||||||
contentproviders.Zenodo,
|
contentproviders.Zenodo,
|
||||||
contentproviders.Figshare,
|
contentproviders.Figshare,
|
||||||
contentproviders.Dataverse,
|
contentproviders.Dataverse,
|
||||||
|
contentproviders.Hydroshare,
|
||||||
contentproviders.Git,
|
contentproviders.Git,
|
||||||
],
|
],
|
||||||
config=True,
|
config=True,
|
||||||
|
|
|
@ -3,3 +3,4 @@ from .base import Local
|
||||||
from .zenodo import Zenodo
|
from .zenodo import Zenodo
|
||||||
from .figshare import Figshare
|
from .figshare import Figshare
|
||||||
from .dataverse import Dataverse
|
from .dataverse import Dataverse
|
||||||
|
from .hydroshare import Hydroshare
|
||||||
|
|
|
@ -0,0 +1,107 @@
|
||||||
|
import zipfile
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
|
||||||
|
from urllib.request import urlretrieve
|
||||||
|
|
||||||
|
from .doi import DoiProvider
|
||||||
|
from .base import ContentProviderException
|
||||||
|
|
||||||
|
|
||||||
|
class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource.

    ``detect`` stores ``resource_id`` and ``version`` on the instance;
    ``fetch`` and ``content_id`` read them afterwards.
    """

    def _fetch_version(self, host):
        """Fetch the resource's modified date and convert it to an epoch.

        Parameters
        ----------
        host : dict
            Host description; ``host["version"]`` is a URL template that is
            formatted with ``self.resource_id``.

        Returns
        -------
        str
            The modified date as whole seconds since the epoch.

        Raises
        ------
        ContentProviderException
            If the metadata contains no date entry of type ``"modified"``.
        """
        json_response = json.loads(
            self.urlopen(host["version"].format(self.resource_id)).read()
        )
        # Use a default so a missing entry does not leak a bare StopIteration.
        modified = next(
            (item for item in json_response["dates"] if item["type"] == "modified"),
            None,
        )
        if modified is None:
            raise ContentProviderException(
                "No modified date found for Hydroshare resource {}.".format(
                    self.resource_id
                )
            )
        # The timestamp looks like "2019-09-25T16:09:17.006152Z". Drop the
        # fractional seconds and the timezone suffix; the value is then
        # interpreted as UTC. The rstrip also covers a timestamp that has no
        # fractional part, where the "Z" would otherwise survive the split.
        date = modified["start_date"].split(".")[0].rstrip("Z")
        parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Hydroshare resource.

        Parameters
        ----------
        doi : str
            A DOI or URL; it is passed through ``self.doi2url`` first.
        ref, extra_args
            Accepted for interface compatibility; not used here.

        Returns
        -------
        dict or None
            A spec with the keys ``"resource"``, ``"host"`` and ``"version"``
            when the resolved URL starts with a known Hydroshare prefix,
            otherwise ``None``.
        """
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
            }
        ]
        url = self.doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of prefixes
            if url.startswith(tuple(host["hostname"])):
                # the resource ID is the last path segment of the URL
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                self.version = self._fetch_version(host)
                return {
                    "resource": self.resource_id,
                    "host": host,
                    "version": self.version,
                }
        return None

    def _urlretrieve(self, bag_url):
        """Download ``bag_url`` with ``urlretrieve``; a seam for tests to patch."""
        return urlretrieve(bag_url)

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource.

        This is a generator: it yields progress messages and does its work
        as it is iterated.

        Parameters
        ----------
        spec : dict
            Spec as returned by ``detect``; ``spec["resource"]`` and
            ``spec["host"]["django_irods"]`` build the bag URL.
        output_dir : str
            Directory the entries of the bag's ``data/contents`` are moved to.
        yield_output : bool
            Not used by this method.
        timeout : int
            Maximum number of seconds to wait for the bag to be prepared.

        Raises
        ------
        ContentProviderException
            If the bag is not ready within ``timeout`` seconds or the
            download URL answers with a status code other than 200.

        Notes
        -----
        The unpacked bag is looked up under ``self.resource_id`` (set by
        ``detect``), not under ``spec["resource"]``.
        """
        # Function-scope import keeps this block self-contained.
        import tempfile

        resource_id = spec["resource"]
        host = spec["host"]

        bag_url = "{}{}".format(host["django_irods"], resource_id)

        yield "Downloading {}.\n".format(bag_url)

        # bag downloads are prepared on demand and may need some time
        wait_time = 10
        total_wait_time = 0
        conn = self.urlopen(bag_url)
        while (
            conn.getcode() == 200
            and conn.info().get_content_type() != "application/zip"
        ):
            total_wait_time += wait_time
            if total_wait_time > timeout:
                msg = "Bag taking too long to prepare, exiting now, try again later."
                yield msg
                raise ContentProviderException(msg)
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(
                wait_time
            )
            time.sleep(wait_time)
            conn = self.urlopen(bag_url)
        if conn.getcode() != 200:
            msg = "Failed to download bag. status code {}.\n".format(conn.getcode())
            yield msg
            raise ContentProviderException(msg)
        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
        bag_file, _ = self._urlretrieve(bag_url)
        try:
            # Unpack into a private temporary directory rather than a
            # "temp" directory relative to the current working directory,
            # which could clash with (and then delete) unrelated files.
            with tempfile.TemporaryDirectory() as unpack_dir:
                with zipfile.ZipFile(bag_file, "r") as zip_file_object:
                    yield "Downloaded, unpacking contents.\n"
                    zip_file_object.extractall(unpack_dir)
                # resources store the contents in the data/contents directory,
                # which is all we want to keep
                contents_dir = os.path.join(
                    unpack_dir, self.resource_id, "data", "contents"
                )
                for entry in os.listdir(contents_dir):
                    shutil.move(os.path.join(contents_dir, entry), output_dir)
                yield "Finished, cleaning up.\n"
        finally:
            # urlretrieve leaves its download behind as a temporary file;
            # remove it when we were handed a path (not a file object).
            if isinstance(bag_file, (str, os.PathLike)):
                try:
                    os.remove(bag_file)
                except OSError:
                    # best-effort cleanup; the contents are already in place
                    pass

    @property
    def content_id(self):
        """The HydroShare resource ID combined with the resource version."""
        return "{}.v{}".format(self.resource_id, self.version)
|
|
@ -0,0 +1,208 @@
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from tempfile import TemporaryDirectory, NamedTemporaryFile
|
||||||
|
from unittest.mock import patch
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
from repo2docker.contentproviders import Hydroshare
|
||||||
|
from repo2docker.contentproviders.base import ContentProviderException
|
||||||
|
|
||||||
|
|
||||||
|
def test_content_id():
    """content_id is "<resource id>.v<modified epoch>" after a successful detect."""
    metadata = '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        response = fake_urlopen.return_value
        # the same fake response serves both the DOI resolution and the metadata
        response.url = (
            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
        )
        response.read = lambda: metadata

        provider = Hydroshare()
        provider.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")

        assert provider.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_hydroshare():
    """detect accepts Hydroshare URLs and DOIs and ignores everything else."""
    metadata = '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
    # valid Hydroshare DOIs trigger this content provider
    expected = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
        },
        "resource": "b8f6eae9d89241cf8b5904033460af61",
        "version": "1569427757",
    }
    # Each entry pairs an identifier with the cumulative number of urlopen
    # calls expected once it has been detected: a plain URL needs one call to
    # fetch the version, a DOI needs one more to resolve the DOI first.
    cases = [
        ("https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", 1),
        ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", 3),
        ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", 5),
    ]

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        fake_urlopen.return_value.url = (
            "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
        )
        fake_urlopen.return_value.read = lambda: metadata

        for identifier, calls_so_far in cases:
            assert Hydroshare().detect(identifier) == expected
            assert fake_urlopen.call_count == calls_so_far

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # Don't trigger the Hydroshare content provider
        for unrelated in ("/some/path/here", "https://example.com/path/here"):
            assert Hydroshare().detect(unrelated) is None

        # don't handle DOIs that aren't from Hydroshare
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        fake_urlopen.return_value.read = lambda: metadata
        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
    """Yield a temporary ZIP file holding two small text files under ``prefix``.

    The yielded object is the ``NamedTemporaryFile`` itself; the file goes
    away when the context exits.
    """
    members = (
        ("{}/some-file.txt".format(prefix), "some content"),
        ("{}/some-other-file.txt".format(prefix), "some more content"),
    )
    with NamedTemporaryFile(suffix=".zip") as zfile:
        # write through the file's name so the archive is complete on disk
        # before the handle is handed out
        with ZipFile(zfile.name, mode="w") as archive:
            for member_name, text in members:
                archive.writestr(member_name, text)

        yield zfile
|
||||||
|
|
||||||
|
|
||||||
|
class MockInfo:
    """Stand-in for a response's header object; only reports a content type."""

    def __init__(self, content_type: str) -> None:
        # value handed back verbatim by get_content_type()
        self.content_type = content_type

    def get_content_type(self) -> str:
        """Return the content type given at construction."""
        return self.content_type
|
||||||
|
|
||||||
|
|
||||||
|
class MockResponse:
    """Stand-in for a urlopen response exposing ``getcode()`` and ``info()``."""

    def __init__(self, content_type, status_code):
        self.status_code = status_code
        self.content_type = content_type
        # built once so every info() call returns the same object
        self.mock_info = MockInfo(self.content_type)

    def getcode(self):
        """Return the status code given at construction."""
        return self.status_code

    def info(self):
        """Return the MockInfo carrying this response's content type."""
        return self.mock_info
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_bag():
    """fetch moves the files from the bag's data/contents into the output dir."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # first answer is not a ZIP yet, the second one is
    responses = [
        MockResponse("application/html", 200),
        MockResponse("application/zip", 200),
    ]

    # we "fetch" a local ZIP file to simulate a Hydroshare resource
    with hydroshare_archive() as hydro_path:
        urlopen_patch = patch.object(Hydroshare, "urlopen", side_effect=responses)
        urlretrieve_patch = patch.object(
            Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]
        )
        with urlopen_patch, urlretrieve_patch:
            hydro = Hydroshare()
            hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"

            with TemporaryDirectory() as d:
                # exhaust the generator so the download and unpacking happen
                output = list(hydro.fetch(spec, d))

                unpacked_files = set(os.listdir(d))
                expected = {"some-other-file.txt", "some-file.txt"}
                assert expected == unpacked_files
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_bag_failure():
    """fetch raises ContentProviderException when the bag URL answers 500."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    failing_urlopen = patch.object(
        Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]
    )

    with hydroshare_archive():
        with failing_urlopen:
            hydro = Hydroshare()
            with TemporaryDirectory() as d:
                with pytest.raises(
                    ContentProviderException,
                    match=r"Failed to download bag\. status code 500\.",
                ):
                    # drain the generator so the yield statements run
                    for _ in hydro.fetch(spec, d):
                        pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_fetch_bag_timeout():
    """fetch gives up with ContentProviderException once the timeout is exceeded."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # a 200 answer that is not a ZIP
    not_ready_urlopen = patch.object(
        Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]
    )

    with hydroshare_archive():
        with not_ready_urlopen:
            hydro = Hydroshare()
            with TemporaryDirectory() as d:
                with pytest.raises(
                    ContentProviderException,
                    match=r"Bag taking too long to prepare, exiting now, try again later\.",
                ):
                    # drain the generator so the yield statements run
                    for _ in hydro.fetch(spec, d, timeout=0):
                        pass
|
Ładowanie…
Reference in New Issue