[MRG] add Hydroshare as a content provider (#800)

[MRG] add Hydroshare as a content provider
pull/842/head
Chris Holdgraf 2020-02-03 10:52:48 -07:00 zatwierdzone przez GitHub
commit 900d280561
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
4 zmienionych plików z 317 dodań i 0 usunięć

1
repo2docker/app.py 100644 → 100755
Wyświetl plik

@ -147,6 +147,7 @@ class Repo2Docker(Application):
contentproviders.Zenodo,
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Hydroshare,
contentproviders.Git,
],
config=True,

Wyświetl plik

@ -3,3 +3,4 @@ from .base import Local
from .zenodo import Zenodo
from .figshare import Figshare
from .dataverse import Dataverse
from .hydroshare import Hydroshare

Wyświetl plik

@ -0,0 +1,107 @@
import zipfile
import os
import shutil
import time
import json
from datetime import datetime, timezone, timedelta
from urllib.request import urlretrieve
from .doi import DoiProvider
from .base import ContentProviderException
class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource.

    ``detect`` stores the matched resource on the instance as
    ``self.resource_id`` and ``self.version``; ``fetch`` and ``content_id``
    read those attributes afterwards.
    """

    def _fetch_version(self, host):
        """Fetch resource modified date and convert to epoch.

        Requests ``host["version"]`` formatted with ``self.resource_id``,
        parses the body as JSON and takes the ``start_date`` of the first
        entry in ``dates`` whose ``type`` is ``"modified"``.

        Returns the date as a string of whole seconds since the epoch.
        """
        json_response = json.loads(
            self.urlopen(host["version"].format(self.resource_id)).read()
        )
        date = next(
            item for item in json_response["dates"] if item["type"] == "modified"
        )["start_date"]
        # Hydroshare timestamp always returns the same timezone, so strip it.
        # Dropping everything after "." removes the fractional seconds and the
        # zone suffix together; the extra rstrip covers a timestamp that has
        # no fractional part, where the "Z" would otherwise survive and break
        # strptime below.
        date = date.split(".")[0].rstrip("Z")
        parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        # the parsed value is naive; interpret it as UTC before converting
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Hydroshare resource.

        ``doi`` is passed through ``self.doi2url``; if the resulting URL starts
        with one of the known Hydroshare resource prefixes, the resource id
        (last path segment) and version are stored on the instance and a spec
        dict with the keys ``resource``, ``host`` and ``version`` is returned.
        Returns ``None`` when no known host matches.
        """
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
            }
        ]
        url = self.doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of candidate prefixes
            if url.startswith(tuple(host["hostname"])):
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                self.version = self._fetch_version(host)
                return {
                    "resource": self.resource_id,
                    "host": host,
                    "version": self.version,
                }
        return None

    def _urlretrieve(self, bag_url):
        """Download ``bag_url`` via ``urllib.request.urlretrieve``.

        Kept as a separate method so the download step can be replaced
        on the class without touching ``fetch``.
        """
        return urlretrieve(bag_url)

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource.

        Generator: yields human-readable progress messages while it works.

        spec -- dict with ``resource`` (the resource id) and ``host`` (a dict
            providing the ``django_irods`` download URL prefix).
        output_dir -- directory the resource's ``data/contents`` entries are
            moved into.
        yield_output -- accepted for interface compatibility; not used here.
        timeout -- maximum number of seconds to keep polling while the bag is
            being prepared.

        Raises ContentProviderException when the bag is not ready within
        ``timeout`` or when the download URL answers with a non-200 status.
        """
        resource_id = spec["resource"]
        host = spec["host"]

        bag_url = "{}{}".format(host["django_irods"], resource_id)

        yield "Downloading {}.\n".format(bag_url)

        # bag downloads are prepared on demand and may need some time
        wait_time = 10
        total_wait_time = 0
        conn = self.urlopen(bag_url)
        while (
            conn.getcode() == 200
            and conn.info().get_content_type() != "application/zip"
        ):
            # the budget is charged before sleeping, so the loop gives up
            # as soon as the next wait would exceed ``timeout``
            total_wait_time += wait_time
            if total_wait_time > timeout:
                msg = "Bag taking too long to prepare, exiting now, try again later."
                yield msg
                raise ContentProviderException(msg)
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(
                wait_time
            )
            time.sleep(wait_time)
            conn = self.urlopen(bag_url)

        if conn.getcode() != 200:
            msg = "Failed to download bag. status code {}.\n".format(conn.getcode())
            yield msg
            raise ContentProviderException(msg)

        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
        filehandle, _ = self._urlretrieve(bag_url)

        # The context manager closes the archive handle even if unpacking fails.
        with zipfile.ZipFile(filehandle, "r") as zip_file_object:
            yield "Downloaded, unpacking contents.\n"
            try:
                zip_file_object.extractall("temp")
                # resources store the contents in the data/contents directory,
                # which is all we want to keep.
                # NOTE: the top-level directory inside the bag is looked up via
                # self.resource_id (set by detect), not via spec["resource"].
                contents_dir = os.path.join(
                    "temp", self.resource_id, "data", "contents"
                )
                for f in os.listdir(contents_dir):
                    shutil.move(os.path.join(contents_dir, f), output_dir)
                yield "Finished, cleaning up.\n"
            finally:
                # Always remove the scratch directory, including when the
                # archive layout is unexpected or the consumer stops iterating.
                shutil.rmtree("temp", ignore_errors=True)

    @property
    def content_id(self):
        """The HydroShare resource ID, suffixed with ``.v<version>``."""
        return "{}.v{}".format(self.resource_id, self.version)

Wyświetl plik

@ -0,0 +1,208 @@
import os
import pytest
from contextlib import contextmanager
from tempfile import TemporaryDirectory, NamedTemporaryFile
from unittest.mock import patch
from zipfile import ZipFile
from repo2docker.contentproviders import Hydroshare
from repo2docker.contentproviders.base import ContentProviderException
def test_content_id():
    """content_id is the resource id joined with the modified-date epoch."""
    resource_url = (
        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
    )
    scimeta = '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # every urlopen call hands back the same stubbed response
        response = fake_urlopen.return_value
        response.url = resource_url
        response.read = lambda: scimeta

        hydro = Hydroshare()
        hydro.detect("10.4211/hs.b8f6eae9d89241cf8b5904033460af61")
        assert hydro.content_id == "b8f6eae9d89241cf8b5904033460af61.v1569427757"
def test_detect_hydroshare():
    """detect() accepts Hydroshare URLs/DOIs and ignores everything else."""
    resource_url = (
        "https://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"
    )
    scimeta = '{"dates": [{"type": "modified", "start_date": "2019-09-25T16:09:17.006152Z"}]}'
    expected = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
        },
        "resource": "b8f6eae9d89241cf8b5904033460af61",
        "version": "1569427757",
    }

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        fake_urlopen.return_value.url = resource_url
        fake_urlopen.return_value.read = lambda: scimeta

        # valid Hydroshare DOIs trigger this content provider; the second
        # element is the cumulative number of urlopen calls expected so far
        cases = (
            # plain resource URL: a single call, to fetch the version
            (resource_url, 1),
            # 2 more calls: one to resolve the DOI, another to fetch the version
            ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", 3),
            # 2 more calls: one to resolve the DOI, another to fetch the version
            ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", 5),
        )
        for identifier, calls_so_far in cases:
            assert Hydroshare().detect(identifier) == expected
            assert fake_urlopen.call_count == calls_so_far

    with patch.object(Hydroshare, "urlopen") as fake_urlopen:
        # Don't trigger the Hydroshare content provider
        for not_hydroshare in ("/some/path/here", "https://example.com/path/here"):
            assert Hydroshare().detect(not_hydroshare) is None

        # don't handle DOIs that aren't from Hydroshare
        fake_urlopen.return_value.url = (
            "http://joss.theoj.org/papers/10.21105/joss.01277"
        )
        fake_urlopen.return_value.read = lambda: scimeta
        assert Hydroshare().detect("https://doi.org/10.21105/joss.01277") is None
@contextmanager
def hydroshare_archive(prefix="b8f6eae9d89241cf8b5904033460af61/data/contents"):
    """Yield a temporary ``.zip`` file holding two small text files under *prefix*.

    The yielded object is the open ``NamedTemporaryFile``; it is removed when
    the context exits.
    """
    members = (
        ("some-file.txt", "some content"),
        ("some-other-file.txt", "some more content"),
    )
    with NamedTemporaryFile(suffix=".zip") as zfile:
        # close the archive before yielding so it is fully written to disk
        with ZipFile(zfile.name, mode="w") as archive:
            for filename, content in members:
                archive.writestr(prefix + "/" + filename, content)
        yield zfile
class MockInfo:
    """Minimal stand-in for a response's header object.

    Only exposes ``get_content_type()``, returning the value given at
    construction time.
    """

    def __init__(self, content_type):
        self.content_type = content_type

    def get_content_type(self):
        """Return the content type this object was created with."""
        return self.content_type
class MockResponse:
    """Minimal stand-in for a urlopen response.

    Provides ``getcode()`` for the status code and ``info()`` for an object
    answering ``get_content_type()``.
    """

    def __init__(self, content_type, status_code):
        self.content_type = content_type
        self.status_code = status_code
        # built once so repeated info() calls return the same object
        self.mock_info = MockInfo(content_type)

    def getcode(self):
        """Return the configured status code."""
        return self.status_code

    def info(self):
        """Return the header stand-in created in ``__init__``."""
        return self.mock_info
def test_fetch_bag():
    """fetch() unpacks the bag's data/contents entries into the output dir."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # first poll answers with a non-zip content type, the second with a zip
    responses = [
        MockResponse("application/html", 200),
        MockResponse("application/zip", 200),
    ]

    # we "fetch" a local ZIP file to simulate a Hydroshare resource
    with hydroshare_archive() as hydro_path:
        urlopen_patch = patch.object(Hydroshare, "urlopen", side_effect=responses)
        retrieve_patch = patch.object(
            Hydroshare, "_urlretrieve", side_effect=[(hydro_path, None)]
        )
        with urlopen_patch, retrieve_patch:
            hydro = Hydroshare()
            hydro.resource_id = "b8f6eae9d89241cf8b5904033460af61"

            with TemporaryDirectory() as d:
                # fetch() is a generator: drain it so the work actually happens
                output = list(hydro.fetch(spec, d))

                unpacked_files = set(os.listdir(d))
                expected = {"some-other-file.txt", "some-file.txt"}
                assert expected == unpacked_files
def test_fetch_bag_failure():
    """fetch() raises ContentProviderException on a non-200 bag response."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    server_error = patch.object(
        Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 500)]
    )
    expect_failure = pytest.raises(
        ContentProviderException,
        match=r"Failed to download bag\. status code 500\.",
    )

    with hydroshare_archive(), server_error:
        hydro = Hydroshare()
        with TemporaryDirectory() as d, expect_failure:
            # loop for yield statements
            for _ in hydro.fetch(spec, d):
                pass
def test_fetch_bag_timeout():
    """fetch() gives up with ContentProviderException once timeout is exceeded."""
    spec = {
        "host": {
            "hostname": [
                "https://www.hydroshare.org/resource/",
                "http://www.hydroshare.org/resource/",
            ],
            "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
        },
        "resource": "123456789",
    }
    # a 200 response that is not a zip means "bag still being prepared"
    not_ready = patch.object(
        Hydroshare, "urlopen", side_effect=[MockResponse("application/html", 200)]
    )
    expect_timeout = pytest.raises(
        ContentProviderException,
        match=r"Bag taking too long to prepare, exiting now, try again later\.",
    )

    with hydroshare_archive(), not_ready:
        hydro = Hydroshare()
        with TemporaryDirectory() as d, expect_timeout:
            # loop for yield statements; timeout=0 makes the first wait exceed it
            for _ in hydro.fetch(spec, d, timeout=0):
                pass