repo2docker/repo2docker/contentproviders/hydroshare.py

import os
import shutil
import tempfile
import time
import zipfile
from urllib.error import HTTPError
from urllib.request import Request, urlopen, urlretrieve

from .base import ContentProvider
from ..utils import normalize_doi, is_doi


class Hydroshare(ContentProvider):
    """Provide contents of a HydroShare resource.

    ``detect`` recognises HydroShare resource URLs (or DOIs that resolve to
    one) and ``fetch`` downloads the resource's bag and copies the entries of
    its ``data/contents`` directory into the output directory.
    """

    def _urlopen(self, req, headers=None):
        """Open ``req`` with ``urlopen``, optionally adding request headers.

        ``req`` may be a URL string or a ``urllib.request.Request``; a string
        is wrapped in a ``Request`` first.  ``headers`` is an optional mapping
        of header names to values added to the request.  Returns whatever
        ``urlopen`` returns; errors raised by ``urlopen`` propagate.
        """
        # someone passed a string, not a request
        if not isinstance(req, Request):
            req = Request(req)

        # NOTE: no User-Agent header is set here; the package version is not
        # imported in this module.
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return urlopen(req)

    def _doi2url(self, doi):
        """Return the URL that ``doi`` resolves to.

        If ``doi`` is not a DOI it is assumed to already be a URL and is
        returned unchanged.  If resolving through https://doi.org raises an
        ``HTTPError`` the normalized DOI is returned instead.
        """
        if not is_doi(doi):
            # Just return what is actually just a URL
            return doi

        doi = normalize_doi(doi)
        try:
            resp = self._urlopen("https://doi.org/{}".format(doi))
        # If the DOI doesn't resolve, just return the normalized DOI
        except HTTPError:
            return doi
        try:
            # The response URL is the final location after redirects.
            return resp.url
        finally:
            # Only the URL is needed; do not leave the connection open.
            resp.close()

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a HydroShare resource.

        ``doi`` is a DOI or a URL.  ``ref`` and ``extra_args`` are accepted
        for interface compatibility and are not used.

        Returns a spec dict ``{"resource": <resource id>, "host": <host>}``
        when the resolved URL starts with a known HydroShare resource prefix,
        otherwise ``None``.  On a match the resource id is also stored on
        ``self.resource_id``.
        """
        # For each host we need the resource URL prefixes ("hostname") and the
        # base URL from which a resource's bag is downloaded ("django_irods").
        hosts = [
            {
                "hostname": [
                    "https://www.hydroshare.org/resource/",
                    "http://www.hydroshare.org/resource/",
                ],
                "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
            },
        ]

        url = self._doi2url(doi)

        for host in hosts:
            if url.startswith(tuple(host["hostname"])):
                # The resource id is the last path segment; tolerate a
                # trailing slash in the URL.
                self.resource_id = url.rstrip("/").rsplit("/", maxsplit=1)[1]
                return {"resource": self.resource_id, "host": host}

        return None

    def fetch(self, spec, output_dir, yield_output=False, timeout=120):
        """Fetch and unpack a HydroShare resource.

        ``spec`` is the dict returned by ``detect``.  ``output_dir`` is the
        directory the entries of the bag's ``data/contents`` directory are
        moved into; it is created if missing.  ``yield_output`` is accepted
        for interface compatibility and is not used.  ``timeout`` is the
        maximum number of seconds to wait for the bag to be prepared; ``None``
        waits indefinitely.

        This is a generator yielding progress messages.  If the bag cannot be
        downloaded (unexpected status code, or preparation exceeds
        ``timeout``) a message is yielded and the generator ends without
        touching ``output_dir``.  Errors raised by ``urlopen`` /
        ``urlretrieve`` / ``zipfile`` propagate.
        """
        resource_id = spec["resource"]
        host = spec["host"]
        # Work from the spec rather than from state left behind by detect(),
        # and keep content_id consistent with what was actually fetched.
        self.resource_id = resource_id

        yield "Fetching HydroShare Resource {}.\n".format(resource_id)

        bag_url = "{}{}".format(host["django_irods"], resource_id)

        # bag downloads are prepared on demand and may need some time
        wait_time = 10
        total_wait_time = 0
        while True:
            conn = self._urlopen(bag_url)
            try:
                content_type = conn.info().get_content_type()
                status = conn.getcode()
            finally:
                # Only the headers are inspected; the body is fetched below.
                conn.close()

            if content_type == "application/zip":
                break
            if status != 200:
                yield "Failed to download bag. status code {}.\n".format(status)
                return

            total_wait_time += wait_time
            if timeout is not None and total_wait_time > timeout:
                yield "Bag taking too long to prepare, exiting now, try again later.\n"
                return
            yield "Bag is being prepared, requesting again in {} seconds.\n".format(
                wait_time
            )
            time.sleep(wait_time)

        # Bag creation seems to need a small time buffer after it says it's ready.
        time.sleep(1)

        os.makedirs(output_dir, exist_ok=True)
        # A private scratch directory holds both the archive and its unpacked
        # tree, so nothing is left behind even if unpacking fails.
        with tempfile.TemporaryDirectory() as work_dir:
            bag_path, _ = urlretrieve(bag_url, os.path.join(work_dir, "bag.zip"))
            unpack_dir = os.path.join(work_dir, "bag")
            with zipfile.ZipFile(bag_path, "r") as bag:
                yield "Downloaded, unpacking contents.\n"
                bag.extractall(unpack_dir)
            # resources store the contents in the data/contents directory,
            # which is all we want to keep
            contents_dir = os.path.join(unpack_dir, resource_id, "data", "contents")
            for name in os.listdir(contents_dir):
                shutil.move(os.path.join(contents_dir, name), output_dir)
            yield "Finished, cleaning up.\n"

    @property
    def content_id(self):
        """The HydroShare resource ID"""
        return self.resource_id
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`import zipfile`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`import os`
			`import shutil`
allow for bag creation 2019-08-23 16:56:52 +00:00			`import time`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
use request.urlretrieve 2019-08-22 22:41:31 +00:00			`from urllib.request import urlopen, Request, urlretrieve`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`from urllib.error import HTTPError`

			`from .base import ContentProvider`
			`from ..utils import normalize_doi, is_doi`


			`class Hydroshare(ContentProvider):`
			`"""Provide contents of a Hydroshare resource."""`

			`def _urlopen(self, req, headers=None):`
			`"""A urlopen() helper"""`
			`# someone passed a string, not a request`
			`if not isinstance(req, Request):`
			`req = Request(req)`

			`#req.add_header("User-Agent", "repo2docker {}".format(__version__))`
			`if headers is not None:`
			`for key, value in headers.items():`
			`req.add_header(key, value)`

			`return urlopen(req)`

			`def _doi2url(self, doi):`
			`# Transform a DOI to a URL`
			`# If not a doi, assume we have a URL and return`
			`if is_doi(doi):`
			`doi = normalize_doi(doi)`

			`try:`
			`resp = self._urlopen("https://doi.org/{}".format(doi))`
			`# If the DOI doesn't resolve, just return URL`
			`except HTTPError:`
			`return doi`
			`return resp.url`
			`else:`
			`# Just return what is actulally just a URL`
			`return doi`

			`def detect(self, doi, ref=None, extra_args=None):`
			`"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""`
			`# We need the hostname (url where records are), api url (for metadata),`
			`# filepath (path to files in metadata), filename (path to filename in`
			`# metadata), download (path to file download URL), and type (path to item type in metadata)`
			`hosts = [`
			`{`
			`"hostname": ["https://www.hydroshare.org/resource/", "http://www.hydroshare.org/resource/"],`
			`"django_irods": "https://www.hydroshare.org/django_irods/download/bags/",`
			`},`
			`]`

			`url = self._doi2url(doi)`

			`for host in hosts:`
			`if any([url.startswith(s) for s in host["hostname"]]):`
handle traliing slash in url 2019-08-22 23:24:52 +00:00			`self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`return {"resource": self.resource_id, "host": host}`

			`def fetch(self, spec, output_dir, yield_output=False):`
			`"""Fetch and unpack a Hydroshare resource"""`
			`resource_id = spec["resource"]`
			`host = spec["host"]`

			`yield "Fetching HydroShare Resource {}.\n".format(resource_id)`

			`bag_url = "{}{}".format(host["django_irods"], resource_id)`
allow for bag creation 2019-08-23 16:56:52 +00:00
			`# bag downloads are prepared on demand and may need some time`
			`conn = urlopen(bag_url)`
actually reconnect 2019-08-23 17:06:02 +00:00			`while conn.info().get_content_type() != "application/zip":`
allow for bag creation 2019-08-23 16:56:52 +00:00			`if conn.getcode() != 200:`
extend wait time and cleanup messaging 2019-08-23 17:09:26 +00:00			`yield "Failed to download bag. status code {}.\n".format(conn.getcode())`
allow for bag creation 2019-08-23 16:56:52 +00:00			`return`
extend wait time and cleanup messaging 2019-08-23 17:09:26 +00:00			`wait_time = 10`
			`yield "Bag is being prepared, requesting again in {} seconds.\n".format(wait_time)`
			`time.sleep(wait_time)`
actually reconnect 2019-08-23 17:06:02 +00:00			`conn = urlopen(bag_url)`
allow for bag creation 2019-08-23 16:56:52 +00:00
update messaging 2019-08-23 17:16:39 +00:00			`# Bag creation seems to need a small time buffer after it says it's ready.`
			`time.sleep(1)`
use request.urlretrieve 2019-08-22 22:41:31 +00:00			`filehandle, _ = urlretrieve(bag_url)`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00			`zip_file_object = zipfile.ZipFile(filehandle, 'r')`
update messaging 2019-08-23 17:16:39 +00:00			`yield "Downloaded, unpacking contents.\n"`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`zip_file_object.extractall("temp")`
allow for bag creation 2019-08-23 16:56:52 +00:00			`# resources store the contents in the data/contents directory, which is all we want to keep`
extract from contents directory 2019-08-22 23:10:55 +00:00			`contents_dir = os.path.join("temp", self.resource_id, "data", "contents")`
			`files = os.listdir(contents_dir)`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`for f in files:`
extract from contents directory 2019-08-22 23:10:55 +00:00			`shutil.move(os.path.join(contents_dir, f), output_dir)`
update messaging 2019-08-23 17:16:39 +00:00			`yield "Finished, cleaning up.\n"`
only use data/contents directory 2019-08-22 23:00:55 +00:00			`shutil.rmtree("temp")`
initial checkin of hydroshare content provider 2019-08-22 22:30:48 +00:00
			`@property`
			`def content_id(self):`
			`"""The HydroShare resource ID"""`
			`return self.resource_id`