repo2docker/repo2docker/contentproviders/hydroshare.py

106 wiersze
3.9 KiB
Python
Czysty Zwykły widok Historia

import zipfile
2019-08-22 23:00:55 +00:00
import os
import shutil
2019-08-23 16:56:52 +00:00
import time
import json
2019-09-30 22:59:01 +00:00
import datetime
2019-08-22 22:41:31 +00:00
from urllib.request import urlopen, Request, urlretrieve
from urllib.error import HTTPError
from .doi import DoiProvider
from .base import ContentProviderException
from ..utils import normalize_doi, is_doi
class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource."""

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Hydroshare resource.

        The DOI/URL is resolved with ``self.doi2url`` and compared against the
        known Hydroshare resource URL prefixes.

        On a match, ``self.resource_id`` and ``self.version`` are set and a
        spec dict with the keys ``"resource"``, ``"host"`` and ``"version"``
        is returned. Returns ``None`` when the URL does not belong to a known
        Hydroshare host.

        ``ref`` and ``extra_args`` are accepted for interface compatibility
        and are not used here.
        """
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
            }
        ]

        def fetch_version(resource_id, host):
            """Fetch resource modified date and convert to epoch"""
            json_response = json.loads(
                self.urlopen(host["version"].format(resource_id)).read()
            )
            modified = next(
                item for item in json_response["dates"] if item["type"] == "modified"
            )["start_date"]
            # Tolerate a fractional-seconds suffix, which the format string
            # below cannot parse.
            modified = modified.split(".")[0]
            # `datetime` is imported as a module in this file, so the class
            # has to be reached through it.
            parsed = datetime.datetime.strptime(modified, "%Y-%m-%dT%H:%M:%S")
            # Pin the timezone so the resulting version does not depend on
            # the local timezone of the machine running the build.
            # NOTE(review): assumes the API reports timestamps in UTC - confirm.
            return parsed.replace(tzinfo=datetime.timezone.utc).timestamp()

        url = self.doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of candidate prefixes
            if url.startswith(tuple(host["hostname"])):
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                self.version = fetch_version(self.resource_id, host)
                return {
                    "resource": self.resource_id,
                    "host": host,
                    "version": self.version,
                }

        return None

    def _urlretrieve(self, bag_url):
        """Download ``bag_url`` and return the ``(filename, headers)`` tuple.

        Thin wrapper around ``urllib.request.urlretrieve`` kept as a separate
        method so the download step can be overridden.
        """
        return urlretrieve(bag_url)

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource.

        This is a generator that yields human readable progress messages.

        Parameters:
            spec: dict as returned by ``detect``; the ``"resource"`` and
                ``"host"`` entries are used.
            output_dir: existing directory that receives the entries of the
                bag's ``data/contents`` directory.
            yield_output: accepted for interface compatibility, unused here.
            timeout: maximum number of seconds to wait for the bag to be
                prepared before giving up.

        Raises:
            ContentProviderException: if the bag is not ready within
                ``timeout`` seconds or the download request does not return
                status code 200.
        """
        # Only this method needs tempfile, so it is imported locally.
        import tempfile

        resource_id = spec["resource"]
        host = spec["host"]

        yield "Fetching HydroShare Resource {}.\n".format(resource_id)
        bag_url = "{}{}".format(host["django_irods"], resource_id)

        # bag downloads are prepared on demand and may need some time
        conn = self.urlopen(bag_url)
        total_wait_time = 0
        while (
            conn.getcode() == 200
            and conn.info().get_content_type() != "application/zip"
        ):
            wait_time = 10
            total_wait_time += wait_time
            if total_wait_time > timeout:
                msg = "Bag taking too long to prepare, exiting now, try again later."
                yield msg
                raise ContentProviderException(msg)
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(
                wait_time
            )
            time.sleep(wait_time)
            conn = self.urlopen(bag_url)

        if conn.getcode() != 200:
            msg = "Failed to download bag. status code {}.\n".format(conn.getcode())
            yield msg
            raise ContentProviderException(msg)

        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
        bag_path, _ = self._urlretrieve(bag_url)

        # Unpack into a unique scratch directory instead of a fixed "temp"
        # directory relative to the current working directory. Creating it
        # inside output_dir keeps the later moves on one filesystem, and the
        # context manager removes it even if unpacking fails.
        with tempfile.TemporaryDirectory(dir=output_dir) as unpack_dir:
            # the context manager closes the archive once it is extracted
            with zipfile.ZipFile(bag_path, "r") as bag:
                yield "Downloaded, unpacking contents.\n"
                bag.extractall(unpack_dir)

            # resources store the contents in the data/contents directory,
            # which is all we want to keep
            contents_dir = os.path.join(unpack_dir, resource_id, "data", "contents")
            for entry in os.listdir(contents_dir):
                shutil.move(os.path.join(contents_dir, entry), output_dir)

            yield "Finished, cleaning up.\n"

    @property
    def content_id(self):
        """The HydroShare resource ID"""
        return "{}.v{}".format(self.resource_id, self.version)