repo2docker/repo2docker/contentproviders/figshare.py

import os
import re
import json
import shutil

from os import makedirs
from os import path
from urllib.request import Request
from urllib.error import HTTPError
from zipfile import is_zipfile

from .doi import DoiProvider
from ..utils import copytree, deep_get


class Figshare(DoiProvider):
    """Provide contents of a Figshare article.

    See https://docs.figshare.com/#public_article for API docs.

    Examples:
      - https://doi.org/10.6084/m9.figshare.9782777
      - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
    """

    # One entry per supported Figshare installation:
    #   hostname: URL prefixes under which article pages are found
    #   api:      base URL for article metadata (the article ID is appended)
    #   filepath: path to the list of files inside the article metadata
    #   filename: path to the file name inside a single file entry
    #   download: path to the file download URL inside a single file entry
    hosts = [
        {
            "hostname": [
                "https://figshare.com/articles/",
                "http://figshare.com/articles/",
                "https://figshare.com/account/articles/",
            ],
            "api": "https://api.figshare.com/v2/articles/",
            "filepath": "files",
            "filename": "name",
            "download": "download_url",
        }
    ]

    # Groups: (1) everything before "/articles/", (2) one path segment,
    # (3) the numeric article ID, (4) an optional "/<single digit>" suffix.
    url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/\d)?")

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Figshare article.

        ``doi`` is resolved to a URL with ``self.doi2url``. ``ref`` and
        ``extra_args`` are accepted but not used here.

        Returns a spec dict ``{"article": <article id>, "host": <hosts entry>}``
        when the URL belongs to a known host and matches ``url_regex``;
        otherwise returns ``None``. On success the article ID is also stored
        on ``self.article_id`` so that ``content_id`` can report it.
        """
        url = self.doi2url(doi)

        for host in self.hosts:
            # str.startswith takes a tuple of prefixes, so no temporary list
            # of booleans is needed.
            if url.startswith(tuple(host["hostname"])):
                match = self.url_regex.match(url)
                if match is None:
                    # A known host, but not an article URL we can parse.
                    return None
                self.article_id = match.group(3)
                return {"article": self.article_id, "host": host}
        return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Figshare article.

        ``spec`` is the dict returned by ``detect``. The article metadata is
        requested from the host's API, and every file that is not link-only is
        handed to ``self.fetch_file`` together with ``output_dir``. Unzipping
        is requested only when the article consists of exactly one file and
        that file's name ends in ``.zip``.

        This is a generator: it yields a progress message first and then
        whatever ``self.fetch_file`` yields. ``yield_output`` is accepted for
        interface compatibility and is not used here.
        """
        article_id = spec["article"]
        host = spec["host"]

        yield "Fetching Figshare article {}.\n".format(article_id)
        req = Request(
            "{}{}".format(host["api"], article_id),
            headers={"accept": "application/json"},
        )
        resp = self.urlopen(req)

        article = json.loads(resp.read().decode("utf-8"))

        # Only fetch files where is_link_only is False. An entry without the
        # flag is treated as a regular, downloadable file instead of aborting
        # the whole fetch with a KeyError.
        files = [
            file_ref
            for file_ref in deep_get(article, host["filepath"])
            if not file_ref.get("is_link_only", False)
        ]
        only_one_file = len(files) == 1
        for file_ref in files:
            # Read the name through the host's configured "filename" path
            # rather than a hard-coded key, mirroring how "filepath" is used.
            file_name = deep_get(file_ref, host["filename"])
            unzip = file_name.endswith(".zip") and only_one_file
            yield from self.fetch_file(file_ref, host, output_dir, unzip)

    @property
    def content_id(self):
        """The Figshare article ID, as stored by a successful ``detect`` call."""
        return self.article_id
add Figshare content provider moving common functions to a DoiProvider 2019-09-08 09:53:42 +00:00			`import os`
			`import re`
			`import json`
			`import shutil`

			`from os import makedirs`
			`from os import path`
			`from urllib.request import Request`
			`from urllib.error import HTTPError`
			`from zipfile import is_zipfile`

			`from .doi import DoiProvider`
			`from ..utils import copytree, deep_get`


			`class Figshare(DoiProvider):`
			`"""Provide contents of a Figshare article.`

			`See https://docs.figshare.com/#public_article for API docs.`

			`Examples:`
			`- https://doi.org/10.6084/m9.figshare.9782777`
			`- https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)`
			`"""`

			`hosts = [`
			`{`
			`"hostname": [`
			`"https://figshare.com/articles/",`
			`"http://figshare.com/articles/",`
			`"https://figshare.com/account/articles/",`
			`],`
			`"api": "https://api.figshare.com/v2/articles/",`
			`"filepath": "files",`
			`"filename": "name",`
			`"download": "download_url",`
			`}`
			`]`

			`url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/\d)?")`

			`def detect(self, doi, ref=None, extra_args=None):`
			`"""Trigger this provider for things that resolve to a Figshare article"""`
			`# We need the hostname (url where records are), api url (for metadata),`
			`# filepath (path to files in metadata), filename (path to filename in`
			`# metadata), download (path to file download URL), and type (path to item type in metadata)`

			`url = self.doi2url(doi)`

			`for host in self.hosts:`
			`if any([url.startswith(s) for s in host["hostname"]]):`
			`match = self.url_regex.match(url)`
			`if match:`
			`self.article_id = match.groups()[2]`
			`return {"article": self.article_id, "host": host}`
			`else:`
			`return None`

			`def fetch(self, spec, output_dir, yield_output=False):`
			`"""Fetch and unpack a Figshare article"""`
			`article_id = spec["article"]`
			`host = spec["host"]`

			`yield "Fetching Figshare article {}.\n".format(article_id)`
			`req = Request(`
			`"{}{}".format(host["api"], article_id),`
			`headers={"accept": "application/json"},`
			`)`
			`resp = self.urlopen(req)`

			`article = json.loads(resp.read().decode("utf-8"))`

			`files = deep_get(article, host["filepath"])`
			`# only fetch files where is_link_only: False`
			`files = [file for file in files if not file["is_link_only"]]`
			`only_one_file = len(files) == 1`
			`for file_ref in files:`
			`unzip = file_ref["name"].endswith(".zip") and only_one_file`
			`for line in self.fetch_file(file_ref, host, output_dir, unzip):`
			`yield line`

			`@property`
			`def content_id(self):`
			`"""The Figshare article ID"""`
			`return self.article_id`