repo2docker/repo2docker/contentproviders/figshare.py

96 wiersze
3.4 KiB
Python

import json
import os
import re
import shutil
from os import makedirs, path
from urllib.error import HTTPError
from urllib.request import Request
from zipfile import is_zipfile
from ..utils import copytree, deep_get
from .doi import DoiProvider
class Figshare(DoiProvider):
"""Provide contents of a Figshare article.
See https://docs.figshare.com/#public_article for API docs.
Examples:
- https://doi.org/10.6084/m9.figshare.9782777
- https://doi.org/10.6084/m9.figshare.9782777.v2
- https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
"""
def __init__(self):
super().__init__()
self.hosts = [
{
"hostname": [
"https://figshare.com/articles/",
"http://figshare.com/articles/",
"https://figshare.com/account/articles/",
],
"api": "https://api.figshare.com/v2/articles/",
"filepath": "files",
"filename": "name",
"download": "download_url",
}
]
# We may need to add other item types in future, see
# https://github.com/jupyterhub/repo2docker/pull/1001#issuecomment-760107436
# for a list
url_regex = re.compile(r"(.*)/articles/(code/|dataset/)?([^/]+)/(\d+)(/)?(\d+)?")
def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Figshare article"""
# We need the hostname (url where records are), api url (for metadata),
# filepath (path to files in metadata), filename (path to filename in
# metadata), download (path to file download URL), and type (path to item type in metadata)
url = self.doi2url(doi)
for host in self.hosts:
if any([url.startswith(s) for s in host["hostname"]]):
match = self.url_regex.match(url)
if match:
self.article_id = match.groups()[3]
self.article_version = match.groups()[5]
if not self.article_version:
self.article_version = "1"
return {
"article": self.article_id,
"host": host,
"version": self.article_version,
}
else:
return None
def fetch(self, spec, output_dir, yield_output=False):
"""Fetch and unpack a Figshare article"""
article_id = spec["article"]
article_version = spec["version"]
host = spec["host"]
yield f"Fetching Figshare article {article_id} in version {article_version}.\n"
resp = self.urlopen(
f'{host["api"]}{article_id}/versions/{article_version}',
headers={"accept": "application/json"},
)
article = resp.json()
files = deep_get(article, host["filepath"])
# only fetch files where is_link_only: False
files = [file for file in files if not file["is_link_only"]]
only_one_file = len(files) == 1
for file_ref in files:
unzip = file_ref["name"].endswith(".zip") and only_one_file
yield from self.fetch_file(file_ref, host, output_dir, unzip)
@property
def content_id(self):
"""The Figshare article ID"""
return f"{self.article_id}.v{self.article_version}"