# Source: repo2docker/repo2docker/contentproviders/hydroshare.py
# Repository viewer metadata: 103 lines, 3.9 KiB, Python, executable file.

import json
import os
import shutil
import tempfile
import time
import zipfile
from datetime import datetime, timedelta, timezone
from urllib.request import urlretrieve

from .base import ContentProviderException
from .doi import DoiProvider
class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource."""

    def _fetch_version(self, host):
        """Fetch the resource's modified date and convert it to epoch seconds.

        Requests ``host["version"]`` (formatted with ``self.resource_id``) and
        reads the ``start_date`` of the entry in ``dates`` whose ``type`` is
        ``"modified"``.

        Returns:
            The modification time as a string of whole epoch seconds.

        Raises:
            ContentProviderException: if the metadata has no "modified" date.
        """
        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
        modified = next(
            (item for item in json_response["dates"] if item["type"] == "modified"),
            None,
        )
        if modified is None:
            # A bare next() would leak StopIteration out of this plain
            # function, which can silently end a caller's iteration.
            raise ContentProviderException(
                f"No modified date found for Hydroshare resource {self.resource_id}"
            )
        # Hydroshare timestamp always returns the same timezone, so strip it
        # (everything from the fractional seconds onwards is dropped).
        date = modified["start_date"].split(".")[0]
        parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        return str(int(epoch))

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Hydroshare resource.

        Resolves ``doi`` with ``self.doi2url`` and, when the resulting URL
        starts with a known Hydroshare resource prefix, stores
        ``resource_id`` and ``version`` on the instance.

        Returns:
            A spec dict with the keys ``"resource"``, ``"host"`` and
            ``"version"``, or ``None`` when the URL matches no known host.
            ``ref`` and ``extra_args`` are not used by this provider.
        """
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
            }
        ]
        url = self.doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of candidate prefixes.
            if url.startswith(tuple(host["hostname"])):
                # The resource id is the last path segment of the URL.
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                self.version = self._fetch_version(host)
                return {
                    "resource": self.resource_id,
                    "host": host,
                    "version": self.version,
                }
        return None

    def _urlretrieve(self, bag_url):
        """Download ``bag_url`` with ``urllib.request.urlretrieve``.

        Returns the ``(filename, headers)`` tuple produced by ``urlretrieve``.
        """
        return urlretrieve(bag_url)

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource.

        This is a generator: it yields progress messages while it works.

        Args:
            spec: dict with the keys ``"resource"`` and ``"host"``, as
                returned by ``detect``.
            output_dir: directory the resource's ``data/contents`` entries
                are moved into.
            yield_output: accepted for interface compatibility; not used here.
            timeout: maximum cumulative number of seconds to wait for the
                bag to be prepared before giving up.

        Raises:
            ContentProviderException: if the bag is not ready within
                ``timeout`` seconds or the download request does not
                return status 200.
        """
        resource_id = spec["resource"]
        host = spec["host"]

        bag_url = f'{host["django_irods"]}{resource_id}'

        yield f"Downloading {bag_url}.\n"

        # bag downloads are prepared on demand and may need some time
        conn = self.urlopen(bag_url)
        total_wait_time = 0
        while (
            conn.status_code == 200
            and conn.headers["content-type"] != "application/zip"
        ):
            wait_time = 10
            total_wait_time += wait_time
            if total_wait_time > timeout:
                msg = "Bag taking too long to prepare, exiting now, try again later."
                yield msg
                raise ContentProviderException(msg)
            yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n"
            time.sleep(wait_time)
            conn = self.urlopen(bag_url)
        if conn.status_code != 200:
            msg = f"Failed to download bag. status code {conn.status_code}.\n"
            yield msg
            raise ContentProviderException(msg)
        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
        filehandle, _ = self._urlretrieve(bag_url)

        # The bag's top-level directory is looked up by the id recorded on the
        # instance by detect(); fall back to the spec's id so fetch() also
        # works on an instance where detect() was never run.
        bag_root = getattr(self, "resource_id", resource_id)

        # Unpack into a private scratch directory instead of a fixed "temp"
        # folder in the current directory: this cannot clobber or delete an
        # unrelated "temp" directory, cannot collide with a concurrent fetch,
        # and is removed even when unpacking fails part-way.
        with tempfile.TemporaryDirectory() as scratch_dir:
            with zipfile.ZipFile(filehandle, "r") as zip_file_object:
                yield "Downloaded, unpacking contents.\n"
                zip_file_object.extractall(scratch_dir)
            # resources store the contents in the data/contents directory, which is all we want to keep
            contents_dir = os.path.join(scratch_dir, bag_root, "data", "contents")
            for entry in os.listdir(contents_dir):
                shutil.move(os.path.join(contents_dir, entry), output_dir)
            yield "Finished, cleaning up.\n"

    @property
    def content_id(self):
        """The HydroShare resource ID, suffixed with ``.v`` and the version."""
        return f"{self.resource_id}.v{self.version}"