kopia lustrzana https://github.com/jupyterhub/repo2docker
98 wiersze
3.4 KiB
Python
98 wiersze
3.4 KiB
Python
import os
|
|
import re
|
|
import json
|
|
import shutil
|
|
|
|
from os import makedirs
|
|
from os import path
|
|
from urllib.request import Request
|
|
from urllib.error import HTTPError
|
|
from zipfile import is_zipfile
|
|
|
|
from .doi import DoiProvider
|
|
from ..utils import copytree, deep_get
|
|
|
|
|
|
class Figshare(DoiProvider):
|
|
"""Provide contents of a Figshare article.
|
|
|
|
See https://docs.figshare.com/#public_article for API docs.
|
|
|
|
Examples:
|
|
- https://doi.org/10.6084/m9.figshare.9782777
|
|
- https://doi.org/10.6084/m9.figshare.9782777.v2
|
|
- https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
|
|
"""
|
|
|
|
def __init__(self):
|
|
self.hosts = [
|
|
{
|
|
"hostname": [
|
|
"https://figshare.com/articles/",
|
|
"http://figshare.com/articles/",
|
|
"https://figshare.com/account/articles/",
|
|
],
|
|
"api": "https://api.figshare.com/v2/articles/",
|
|
"filepath": "files",
|
|
"filename": "name",
|
|
"download": "download_url",
|
|
}
|
|
]
|
|
|
|
url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d+)?")
|
|
|
|
def detect(self, doi, ref=None, extra_args=None):
|
|
"""Trigger this provider for things that resolve to a Figshare article"""
|
|
# We need the hostname (url where records are), api url (for metadata),
|
|
# filepath (path to files in metadata), filename (path to filename in
|
|
# metadata), download (path to file download URL), and type (path to item type in metadata)
|
|
|
|
url = self.doi2url(doi)
|
|
|
|
for host in self.hosts:
|
|
if any([url.startswith(s) for s in host["hostname"]]):
|
|
match = self.url_regex.match(url)
|
|
if match:
|
|
self.article_id = match.groups()[2]
|
|
self.article_version = match.groups()[4]
|
|
if not self.article_version:
|
|
self.article_version = "1"
|
|
return {
|
|
"article": self.article_id,
|
|
"host": host,
|
|
"version": self.article_version,
|
|
}
|
|
else:
|
|
return None
|
|
|
|
def fetch(self, spec, output_dir, yield_output=False):
|
|
"""Fetch and unpack a Figshare article"""
|
|
article_id = spec["article"]
|
|
article_version = spec["version"]
|
|
host = spec["host"]
|
|
|
|
yield "Fetching Figshare article {} in version {}.\n".format(
|
|
article_id, article_version
|
|
)
|
|
req = Request(
|
|
"{}{}/versions/{}".format(host["api"], article_id, article_version),
|
|
headers={"accept": "application/json"},
|
|
)
|
|
resp = self.urlopen(req)
|
|
|
|
article = json.loads(resp.read().decode("utf-8"))
|
|
|
|
files = deep_get(article, host["filepath"])
|
|
# only fetch files where is_link_only: False
|
|
files = [file for file in files if not file["is_link_only"]]
|
|
only_one_file = len(files) == 1
|
|
for file_ref in files:
|
|
unzip = file_ref["name"].endswith(".zip") and only_one_file
|
|
for line in self.fetch_file(file_ref, host, output_dir, unzip):
|
|
yield line
|
|
|
|
@property
|
|
def content_id(self):
|
|
"""The Figshare article ID"""
|
|
return "{}.v{}".format(self.article_id, self.article_version)
|