repo2docker/repo2docker/contentproviders/hydroshare.py

import datetime
import json
import os
import shutil
import tempfile
import time
import zipfile

from urllib.error import HTTPError
from urllib.request import urlopen, Request, urlretrieve

from .base import ContentProviderException
from .doi import DoiProvider
from ..utils import normalize_doi, is_doi


class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource.

    ``detect`` resolves a DOI/URL, and when it points at a known HydroShare
    host it records ``resource_id`` and ``version`` on the instance.
    ``fetch`` downloads the resource "bag" (a zip archive) and moves the
    files found under ``<resource_id>/data/contents`` into the output
    directory. ``content_id`` combines the two attributes set by ``detect``.
    """

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Hydroshare resource.

        Parameters
        ----------
        doi:
            DOI or URL; it is resolved through ``self.doi2url``.
        ref, extra_args:
            Accepted for interface compatibility; not used here.

        Returns
        -------
        dict or None
            ``{"resource": ..., "host": ..., "version": ...}`` when the
            resolved URL starts with one of the known HydroShare resource
            prefixes, otherwise ``None``.
        """
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
            }
        ]

        def fetch_version(resource_id, host):
            """Fetch resource modified date and convert to epoch seconds."""
            # Use the argument rather than instance state so the helper does
            # not depend on the order in which attributes were assigned.
            raw_metadata = self.urlopen(host["version"].format(resource_id)).read()
            json_response = json.loads(raw_metadata)
            date = next(
                item for item in json_response["dates"] if item["type"] == "modified"
            )["start_date"]
            # NOTE(review): tolerate a fractional-seconds suffix by dropping
            # everything after the first "." -- confirm the exact format the
            # API returns.
            date = date.split(".")[0]
            # `datetime` is imported as a module, so the class has to be
            # reached as `datetime.datetime`.
            modified = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
            # Pin the timezone so the version (and therefore content_id) does
            # not vary with the local timezone of the machine running this.
            # NOTE(review): assumes the API reports UTC -- confirm.
            return modified.replace(tzinfo=datetime.timezone.utc).timestamp()

        url = self.doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of candidate prefixes.
            if url.startswith(tuple(host["hostname"])):
                # The resource id is the last path segment; strip "/" first so
                # a trailing slash does not yield an empty segment.
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                self.version = fetch_version(self.resource_id, host)
                return {
                    "resource": self.resource_id,
                    "host": host,
                    "version": self.version,
                }

        # Not a HydroShare URL: this provider does not apply.
        return None

    def _urlretrieve(self, bag_url):
        """Download ``bag_url`` to a local file via ``urllib.request.urlretrieve``.

        Kept as a separate method so the download step can be overridden.
        Returns the ``(filename, headers)`` tuple from ``urlretrieve``.
        """
        return urlretrieve(bag_url)

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource.

        This is a generator: it yields progress messages while working.

        Parameters
        ----------
        spec:
            Dict as returned by ``detect``; ``spec["resource"]`` and
            ``spec["host"]["django_irods"]`` are read.
        output_dir:
            Directory that receives the entries of the bag's
            ``data/contents`` folder.
        yield_output:
            Accepted for interface compatibility; not used here.
        timeout:
            Maximum number of seconds to spend waiting for the bag to be
            prepared before giving up.

        Raises
        ------
        ContentProviderException
            If the bag is not ready within ``timeout`` seconds or the server
            answers with a status code other than 200.
        """
        resource_id = spec["resource"]
        host = spec["host"]

        yield "Fetching HydroShare Resource {}.\n".format(resource_id)

        bag_url = "{}{}".format(host["django_irods"], resource_id)

        # bag downloads are prepared on demand and may need some time
        wait_time = 10
        total_wait_time = 0
        conn = self.urlopen(bag_url)
        while (
            conn.getcode() == 200
            and conn.info().get_content_type() != "application/zip"
        ):
            # Account for the upcoming sleep first so we never start a wait
            # that would exceed the timeout.
            total_wait_time += wait_time
            if total_wait_time > timeout:
                msg = "Bag taking too long to prepare, exiting now, try again later."
                yield msg
                raise ContentProviderException(msg)
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(
                wait_time
            )
            time.sleep(wait_time)
            conn = self.urlopen(bag_url)
        if conn.getcode() != 200:
            msg = "Failed to download bag. status code {}.\n".format(conn.getcode())
            yield msg
            raise ContentProviderException(msg)
        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
        filehandle, _ = self._urlretrieve(bag_url)
        # The context managers close the archive and remove the scratch
        # directory even if extraction or moving fails part-way.
        with zipfile.ZipFile(filehandle, "r") as zip_file_object:
            yield "Downloaded, unpacking contents.\n"
            # A unique scratch directory instead of a fixed "temp" in the
            # current working directory, so parallel or repeated runs cannot
            # collide with each other or with leftovers from a failed run.
            with tempfile.TemporaryDirectory() as extract_dir:
                zip_file_object.extractall(extract_dir)
                # resources store the contents in the data/contents directory,
                # which is all we want to keep. Use the id from `spec`, which
                # is the one the bag URL was built from.
                contents_dir = os.path.join(
                    extract_dir, resource_id, "data", "contents"
                )
                for f in os.listdir(contents_dir):
                    shutil.move(os.path.join(contents_dir, f), output_dir)
                yield "Finished, cleaning up.\n"

    @property
    def content_id(self):
        """The HydroShare resource ID combined with its version.

        Formatted as ``<resource_id>.v<version>``; both attributes are set by
        ``detect``, which therefore has to run first.
        """
        return "{}.v{}".format(self.resource_id, self.version)
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`import zipfile`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`import os`
			`import shutil`
allow for bag creation 2019-08-23 16:56:52 +00:00			`import time`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`import json`
use datetime for iso parsing 2019-09-30 22:59:01 +00:00			`import datetime`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
use request.urlretrieve 2019-08-22 22:41:31 +00:00			`from urllib.request import urlopen, Request, urlretrieve`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`from urllib.error import HTTPError`

update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`from .doi import DoiProvider`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`from .base import ContentProviderException`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`from ..utils import normalize_doi, is_doi`


update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`class Hydroshare(DoiProvider):`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`"""Provide contents of a Hydroshare resource."""`

			`def detect(self, doi, ref=None, extra_args=None):`
reformatting with lint 2019-09-30 19:50:24 +00:00			`"""Trigger this provider for things that resolve to a Hydroshare resource"""`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`hosts = [`
			`{`
reformatting with lint 2019-09-30 19:50:24 +00:00			`"hostname": [`
			`"https://www.hydroshare.org/resource/",`
			`"http://www.hydroshare.org/resource/",`
			`],`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",`
reformatting with lint 2019-09-30 19:50:24 +00:00			`"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",`
			`}`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`]`

raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`def fetch_version(resource_id, host):`
add hydroshare resource versioning and raise exception fetch fails 2019-09-25 20:37:19 +00:00			`"""Fetch resource modified date and convert to epoch"""`
reformatting with lint 2019-09-30 19:50:24 +00:00			`json_response = json.loads(`
			`self.urlopen(host["version"].format(self.resource_id)).read()`
			`)`
			`date = next(`
			`item for item in json_response["dates"] if item["type"] == "modified"`
			`)["start_date"]`
use datetime for iso parsing 2019-09-30 22:59:01 +00:00			`return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S").timestamp()`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`url = self.doi2url(doi)`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
			`for host in hosts:`
			`if any([url.startswith(s) for s in host["hostname"]]):`
handle trailing slash in url 2019-08-22 23:24:52 +00:00			`self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`self.version = fetch_version(self.resource_id, host)`
reformatting with lint 2019-09-30 19:50:24 +00:00			`return {`
			`"resource": self.resource_id,`
			`"host": host,`
			`"version": self.version,`
			`}`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
fix _urlretrieve class method 2019-09-25 16:58:45 +00:00			`def _urlretrieve(self, bag_url):`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`return urlretrieve(bag_url)`

			`def fetch(self, spec, output_dir, yield_output=False, timeout=120):`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`"""Fetch and unpack a Hydroshare resource"""`
			`resource_id = spec["resource"]`
			`host = spec["host"]`

			`yield "Fetching HydroShare Resource {}.\n".format(resource_id)`

			`bag_url = "{}{}".format(host["django_irods"], resource_id)`
allow for bag creation 2019-08-23 16:56:52 +00:00
			`# bag downloads are prepared on demand and may need some time`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`conn = self.urlopen(bag_url)`
			`total_wait_time = 0`
reformatting with lint 2019-09-30 19:50:24 +00:00			`while (`
			`conn.getcode() == 200`
			`and conn.info().get_content_type() != "application/zip"`
			`):`
extend wait time and cleanup messaging 2019-08-23 17:09:26 +00:00			`wait_time = 10`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`total_wait_time += wait_time`
			`if total_wait_time > timeout:`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`msg = "Bag taking too long to prepare, exiting now, try again later."`
			`yield msg`
			`raise ContentProviderException(msg)`
reformatting with lint 2019-09-30 19:50:24 +00:00			`yield "Bag is being prepared, requesting again in {} seconds.\n".format(`
			`wait_time`
			`)`
extend wait time and cleanup messaging 2019-08-23 17:09:26 +00:00			`time.sleep(wait_time)`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`conn = self.urlopen(bag_url)`
			`if conn.getcode() != 200:`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`msg = "Failed to download bag. status code {}.\n".format(conn.getcode())`
			`yield msg`
			`raise ContentProviderException(msg)`
update messaging 2019-08-23 17:16:39 +00:00			`# Bag creation seems to need a small time buffer after it says it's ready.`
			`time.sleep(1)`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`filehandle, _ = self._urlretrieve(bag_url)`
reformatting with lint 2019-09-30 19:50:24 +00:00			`zip_file_object = zipfile.ZipFile(filehandle, "r")`
update messaging 2019-08-23 17:16:39 +00:00			`yield "Downloaded, unpacking contents.\n"`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`zip_file_object.extractall("temp")`
allow for bag creation 2019-08-23 16:56:52 +00:00			`# resources store the contents in the data/contents directory, which is all we want to keep`
extract from contents directory 2019-08-22 23:10:55 +00:00			`contents_dir = os.path.join("temp", self.resource_id, "data", "contents")`
			`files = os.listdir(contents_dir)`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`for f in files:`
extract from contents directory 2019-08-22 23:10:55 +00:00			`shutil.move(os.path.join(contents_dir, f), output_dir)`
update messaging 2019-08-23 17:16:39 +00:00			`yield "Finished, cleaning up.\n"`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`shutil.rmtree("temp")`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
			`@property`
			`def content_id(self):`
			`"""The HydroShare resource ID"""`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`return "{}.v{}".format(self.resource_id, self.version)`