repo2docker/repo2docker/contentproviders/hydroshare.py

import json
import os
import shutil
import tempfile
import time
import zipfile
from datetime import datetime, timedelta, timezone
from urllib.request import urlretrieve

from .base import ContentProviderException
from .doi import DoiProvider


class Hydroshare(DoiProvider):
    """Provide contents of a Hydroshare resource.

    ``detect`` resolves a DOI/URL to a Hydroshare resource and records
    ``resource_id`` and ``version`` on the instance; ``fetch`` downloads the
    resource's bag (a zip archive) and moves the files found under its
    ``data/contents`` directory into the requested output directory.
    """

    def _fetch_version(self, host):
        """Fetch the resource's modified date and convert it to an epoch string.

        Parameters
        ----------
        host : dict
            Host description whose ``"version"`` entry is a URL template
            taking the resource id.

        Returns
        -------
        str
            The modification time as whole seconds since the epoch.

        Raises
        ------
        ContentProviderException
            If the metadata contains no date entry of type ``"modified"``.
        """
        json_response = self.urlopen(host["version"].format(self.resource_id)).json()
        modified = next(
            (item for item in json_response["dates"] if item["type"] == "modified"),
            None,
        )
        if modified is None:
            # Raise a provider error instead of leaking a bare StopIteration.
            raise ContentProviderException(
                f"No modified date found for Hydroshare resource {self.resource_id}."
            )
        # Keep only the part before the fractional seconds; this also drops
        # the timezone suffix, and the value is then interpreted as UTC.
        date = modified["start_date"].split(".")[0]
        parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp to whole seconds
        return str(int(epoch))

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Hydroshare resource.

        Parameters
        ----------
        doi : str
            DOI or URL, resolved to a URL through ``self.doi2url``.
        ref, extra_args
            Accepted for interface compatibility; not used here.

        Returns
        -------
        dict or None
            A spec with ``"resource"``, ``"host"`` and ``"version"`` keys when
            the resolved URL starts with a known Hydroshare resource prefix,
            otherwise ``None``. On a match, ``self.resource_id`` and
            ``self.version`` are set as a side effect.
        """
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
            }
        ]
        url = self.doi2url(doi)

        for host in hosts:
            # str.startswith accepts a tuple of candidate prefixes
            if url.startswith(tuple(host["hostname"])):
                # the resource id is the last path segment; tolerate a
                # trailing slash on the URL
                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
                self.version = self._fetch_version(host)
                return {
                    "resource": self.resource_id,
                    "host": host,
                    "version": self.version,
                }
        return None

    def _urlretrieve(self, bag_url):
        """Download ``bag_url`` to a local file; returns ``urlretrieve``'s result.

        Kept as a separate method so the download step can be overridden.
        """
        return urlretrieve(bag_url)

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a Hydroshare resource.

        This is a generator yielding progress messages.

        Parameters
        ----------
        spec : dict
            Spec as returned by ``detect`` (``"resource"`` and ``"host"`` are
            read).
        output_dir : str
            Directory the resource's content files are moved into.
        yield_output : bool
            Accepted for interface compatibility; not used here.
        timeout : int
            Maximum number of seconds to wait for the bag to be prepared.

        Raises
        ------
        ContentProviderException
            If the bag is not ready within ``timeout`` seconds or the
            download request does not return status code 200.
        """
        resource_id = spec["resource"]
        host = spec["host"]

        bag_url = f'{host["django_irods"]}{resource_id}'

        yield f"Downloading {bag_url}.\n"

        # bag downloads are prepared on demand and may need some time
        conn = self.urlopen(bag_url)
        total_wait_time = 0
        # .get(): a response without a content-type header is treated as
        # "not ready yet" rather than raising KeyError
        while (
            conn.status_code == 200
            and conn.headers.get("content-type") != "application/zip"
        ):
            wait_time = 10
            total_wait_time += wait_time
            if total_wait_time > timeout:
                msg = "Bag taking too long to prepare, exiting now, try again later."
                yield msg
                raise ContentProviderException(msg)
            yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n"
            time.sleep(wait_time)
            conn = self.urlopen(bag_url)
        if conn.status_code != 200:
            msg = f"Failed to download bag. status code {conn.status_code}.\n"
            yield msg
            raise ContentProviderException(msg)
        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)
        bag_path, _ = self._urlretrieve(bag_url)

        # The archive's top-level directory is expected to be named after the
        # resource. Prefer the id recorded by detect() (previous behaviour),
        # falling back to the spec when detect() was not run on this instance.
        bag_dir_name = getattr(self, "resource_id", resource_id)

        # Unpack into a private scratch directory instead of a fixed "temp"
        # directory in the current working directory, so an unrelated "temp"
        # directory is never touched and cleanup also happens on failure.
        with tempfile.TemporaryDirectory() as extract_dir:
            with zipfile.ZipFile(bag_path, "r") as zip_file_object:
                yield "Downloaded, unpacking contents.\n"
                zip_file_object.extractall(extract_dir)
            # resources store the contents in the data/contents directory,
            # which is all we want to keep
            contents_dir = os.path.join(extract_dir, bag_dir_name, "data", "contents")
            for entry in os.listdir(contents_dir):
                shutil.move(os.path.join(contents_dir, entry), output_dir)
            yield "Finished, cleaning up.\n"

    @property
    def content_id(self):
        """The HydroShare resource ID combined with its version."""
        return f"{self.resource_id}.v{self.version}"
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2022-10-31 22:32:14 +00:00			`import json`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`import os`
			`import shutil`
allow for bag creation 2019-08-23 16:56:52 +00:00			`import time`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2022-10-31 22:32:14 +00:00			`import zipfile`
			`from datetime import datetime, timedelta, timezone`
update tests and code to follow best practices pointed out in review 2019-12-06 23:22:18 +00:00			`from urllib.request import urlretrieve`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`from .base import ContentProviderException`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2022-10-31 22:32:14 +00:00			`from .doi import DoiProvider`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00

update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`class Hydroshare(DoiProvider):`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`"""Provide contents of a Hydroshare resource."""`

update tests and code to follow best practices pointed out in review 2019-12-06 23:22:18 +00:00			`def _fetch_version(self, host):`
			`"""Fetch resource modified date and convert to epoch"""`
Replace urllib by requests in contentproviders requests is globally simpler to use, and more and more people are more familiar with this later than urllib. 2020-12-10 17:55:14 +00:00			`json_response = self.urlopen(host["version"].format(self.resource_id)).json()`
update tests and code to follow best practices pointed out in review 2019-12-06 23:22:18 +00:00			`date = next(`
			`item for item in json_response["dates"] if item["type"] == "modified"`
			`)["start_date"]`
			`# Hydroshare timestamp always returns the same timezone, so strip it`
			`date = date.split(".")[0]`
			`parsed_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")`
			`epoch = parsed_date.replace(tzinfo=timezone(timedelta(0))).timestamp()`
			`# truncate the timestamp`
			`return str(int(epoch))`

initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`def detect(self, doi, ref=None, extra_args=None):`
reformatting with lint 2019-09-30 19:50:24 +00:00			`"""Trigger this provider for things that resolve to a Hydroshare resource"""`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`hosts = [`
			`{`
reformatting with lint 2019-09-30 19:50:24 +00:00			`"hostname": [`
			`"https://www.hydroshare.org/resource/",`
			`"http://www.hydroshare.org/resource/",`
			`],`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",`
reformatting with lint 2019-09-30 19:50:24 +00:00			`"version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",`
			`}`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`]`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`url = self.doi2url(doi)`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
			`for host in hosts:`
			`if any([url.startswith(s) for s in host["hostname"]]):`
handle traliing slash in url 2019-08-22 23:24:52 +00:00			`self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]`
update tests and code to follow best practices pointed out in review 2019-12-06 23:22:18 +00:00			`self.version = self._fetch_version(host)`
reformatting with lint 2019-09-30 19:50:24 +00:00			`return {`
			`"resource": self.resource_id,`
			`"host": host,`
			`"version": self.version,`
			`}`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
fix _urlretrieve class method 2019-09-25 16:58:45 +00:00			`def _urlretrieve(self, bag_url):`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`return urlretrieve(bag_url)`

			`def fetch(self, spec, output_dir, yield_output=False, timeout=120):`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`"""Fetch and unpack a Hydroshare resource"""`
			`resource_id = spec["resource"]`
			`host = spec["host"]`

refactor: manually add transitions to f-strings 2022-10-23 17:18:37 +00:00			`bag_url = f'{host["django_irods"]}{resource_id}'`
allow for bag creation 2019-08-23 16:56:52 +00:00
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2022-10-23 16:09:11 +00:00			`yield f"Downloading {bag_url}.\n"`
update downloading message to include full url 2019-10-14 19:56:29 +00:00
allow for bag creation 2019-08-23 16:56:52 +00:00			`# bag downloads are prepared on demand and may need some time`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`conn = self.urlopen(bag_url)`
			`total_wait_time = 0`
reformatting with lint 2019-09-30 19:50:24 +00:00			`while (`
Fix regression in hydroshare introduced after moving to requests 2021-04-01 13:31:19 +00:00			`conn.status_code == 200`
			`and conn.headers["content-type"] != "application/zip"`
reformatting with lint 2019-09-30 19:50:24 +00:00			`):`
extend wait time and cleanup messaging 2019-08-23 17:09:26 +00:00			`wait_time = 10`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`total_wait_time += wait_time`
			`if total_wait_time > timeout:`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`msg = "Bag taking too long to prepare, exiting now, try again later."`
			`yield msg`
			`raise ContentProviderException(msg)`
refactor: manually add transitions to f-strings 2022-10-23 17:18:37 +00:00			`yield f"Bag is being prepared, requesting again in {wait_time} seconds.\n"`
extend wait time and cleanup messaging 2019-08-23 17:09:26 +00:00			`time.sleep(wait_time)`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`conn = self.urlopen(bag_url)`
Fix regression in hydroshare introduced after moving to requests 2021-04-01 13:31:19 +00:00			`if conn.status_code != 200:`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2022-10-23 16:09:11 +00:00			`msg = f"Failed to download bag. status code {conn.status_code}.\n"`
raise exceptions during failure and add modify date version 2019-09-25 20:00:47 +00:00			`yield msg`
			`raise ContentProviderException(msg)`
update messaging 2019-08-23 17:16:39 +00:00			`# Bag creation seems to need a small time buffer after it says it's ready.`
			`time.sleep(1)`
update hydroshare content provider to doi and add tests 2019-09-25 15:59:44 +00:00			`filehandle, _ = self._urlretrieve(bag_url)`
reformatting with lint 2019-09-30 19:50:24 +00:00			`zip_file_object = zipfile.ZipFile(filehandle, "r")`
update messaging 2019-08-23 17:16:39 +00:00			`yield "Downloaded, unpacking contents.\n"`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`zip_file_object.extractall("temp")`
allow for bag creation 2019-08-23 16:56:52 +00:00			`# resources store the contents in the data/contents directory, which is all we want to keep`
extract from contents directory 2019-08-22 23:10:55 +00:00			`contents_dir = os.path.join("temp", self.resource_id, "data", "contents")`
			`files = os.listdir(contents_dir)`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`for f in files:`
extract from contents directory 2019-08-22 23:10:55 +00:00			`shutil.move(os.path.join(contents_dir, f), output_dir)`
update messaging 2019-08-23 17:16:39 +00:00			`yield "Finished, cleaning up.\n"`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`shutil.rmtree("temp")`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
			`@property`
			`def content_id(self):`
			`"""The HydroShare resource ID"""`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2022-10-23 16:09:11 +00:00			`return f"{self.resource_id}.v{self.version}"`