# repo2docker/repo2docker/contentproviders/figshare.py
#
# 98 lines
# 3.4 KiB
# Python

import os
import re
import json
import shutil
from os import makedirs
from os import path
from urllib.request import Request
from urllib.error import HTTPError
from zipfile import is_zipfile
from .doi import DoiProvider
from ..utils import copytree, deep_get
class Figshare(DoiProvider):
    """Provide contents of a Figshare article.

    See https://docs.figshare.com/#public_article for API docs.

    Examples:
      - https://doi.org/10.6084/m9.figshare.9782777
      - https://doi.org/10.6084/m9.figshare.9782777.v2
      - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
    """

    # We may need to add other item types in future, see
    # https://github.com/jupyterhub/repo2docker/pull/1001#issuecomment-760107436
    # for a list
    #
    # Capture groups:
    #   1: everything before "/articles/"
    #   2: optional item type ("code/" or "dataset/")
    #   3: one path segment (the article title in the example URLs)
    #   4: numeric article id
    #   5: optional "/" after the article id
    #   6: optional numeric version
    url_regex = re.compile(r"(.*)/articles/(code/|dataset/)?([^/]+)/(\d+)(/)?(\d+)?")

    def __init__(self):
        super().__init__()
        # Each host entry describes one Figshare installation:
        #   hostname: URL prefixes under which article pages live
        #   api:      base URL of the article metadata API
        #   filepath: path to the list of files inside the article metadata
        #   filename: path to the file name inside one file entry
        #   download: path to the download URL inside one file entry
        self.hosts = [
            {
                "hostname": [
                    "https://figshare.com/articles/",
                    "http://figshare.com/articles/",
                    "https://figshare.com/account/articles/",
                ],
                "api": "https://api.figshare.com/v2/articles/",
                "filepath": "files",
                "filename": "name",
                "download": "download_url",
            }
        ]

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Figshare article.

        Args:
            doi: Identifier passed to ``self.doi2url`` to obtain the URL
                that is matched against the known hosts.
            ref: Not used by this method.
            extra_args: Not used by this method.

        Returns:
            A dict with the keys ``"article"`` (article id), ``"host"`` (the
            matching entry of ``self.hosts``) and ``"version"`` (article
            version, ``"1"`` when the URL carries none), or ``None`` when the
            URL does not point to a recognised Figshare article.

        On a successful match the article id and version are also stored on
        the instance as ``article_id`` and ``article_version``; ``content_id``
        reads them.
        """
        url = self.doi2url(doi)

        for host in self.hosts:
            # str.startswith accepts a tuple of prefixes, so no intermediate
            # list of booleans is needed.
            if not url.startswith(tuple(host["hostname"])):
                continue

            match = self.url_regex.match(url)
            if match is None:
                # Known host, but the URL does not look like an article page.
                return None

            self.article_id = match.group(4)
            # A URL without an explicit version refers to version 1.
            self.article_version = match.group(6) or "1"
            return {
                "article": self.article_id,
                "host": host,
                "version": self.article_version,
            }

        return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Figshare article.

        Args:
            spec: Dict as returned by ``detect``, with the keys
                ``"article"``, ``"version"`` and ``"host"``.
            output_dir: Passed through to ``self.fetch_file`` for every file.
            yield_output: Not used by this method.

        Yields:
            A status line announcing the fetch, followed by whatever
            ``self.fetch_file`` yields for each file of the article.
        """
        article_id = spec["article"]
        article_version = spec["version"]
        host = spec["host"]

        yield f"Fetching Figshare article {article_id} in version {article_version}.\n"

        resp = self.urlopen(
            f'{host["api"]}{article_id}/versions/{article_version}',
            headers={"accept": "application/json"},
        )
        article = resp.json()

        files = deep_get(article, host["filepath"])
        # only fetch files where is_link_only: False
        files = [file for file in files if not file["is_link_only"]]

        # Unzipping is only requested when the article consists of exactly
        # one file and that file is a .zip archive.
        only_one_file = len(files) == 1
        for file_ref in files:
            # Read the file name through the host description, the same way
            # the file list itself is located above.
            filename = deep_get(file_ref, host["filename"])
            unzip = filename.endswith(".zip") and only_one_file
            yield from self.fetch_file(file_ref, host, output_dir, unzip)

    @property
    def content_id(self):
        """The Figshare article ID and version, formatted as ``<id>.v<version>``.

        Reads ``article_id`` and ``article_version``, which are set by a
        successful ``detect`` call.
        """
        return f"{self.article_id}.v{self.article_version}"