diff --git a/docs/source/contributing/roadmap.md b/docs/source/contributing/roadmap.md index c14ea387..24af47a9 100644 --- a/docs/source/contributing/roadmap.md +++ b/docs/source/contributing/roadmap.md @@ -83,4 +83,3 @@ time there is no active plan for an item. The project would like to find the resources and time to discuss and then execute these ideas. * support execution on a remote host (with more resources than available locally) via the command-line * add support for using ZIP files as the repo (`repo2docker https://example.com/an-archive.zip`) this will give us access to several archives (like Zenodo) that expose things as ZIP files. -* add support for Zenodo (`repo2docker 10.5281/zenodo.1476680`) so Zenodo software archives can be used as the source in addition to a git repository diff --git a/docs/source/usage.rst b/docs/source/usage.rst index b0fff1be..aae58cc7 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -12,7 +12,7 @@ Using ``repo2docker`` ``repo2docker`` can build a reproducible computational environment for any repository that follows :ref:`specification`. repo2docker is called with the URL of a Git repository, -a Zenodo DOI or a path to a local directory. It then +a DOI from Zenodo or Figshare, or a path to a local directory. It then performs these steps: 1. Inspects the repository for :ref:`configuration files `. These will be used to build diff --git a/repo2docker/app.py b/repo2docker/app.py index 755f0af1..ba0a0313 100644 --- a/repo2docker/app.py +++ b/repo2docker/app.py @@ -142,7 +142,12 @@ class Repo2Docker(Application): # detecting if something will successfully `git clone` is very hard if all # you can do is look at the path/URL to it. 
content_providers = List( - [contentproviders.Local, contentproviders.Zenodo, contentproviders.Git], + [ + contentproviders.Local, + contentproviders.Zenodo, + contentproviders.Figshare, + contentproviders.Git, + ], config=True, help=""" Ordered list by priority of ContentProviders to try in turn to fetch diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py index d648731d..f7f9369b 100644 --- a/repo2docker/contentproviders/__init__.py +++ b/repo2docker/contentproviders/__init__.py @@ -1,3 +1,4 @@ from .git import Git from .base import Local from .zenodo import Zenodo +from .figshare import Figshare diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py new file mode 100644 index 00000000..90c087aa --- /dev/null +++ b/repo2docker/contentproviders/doi.py @@ -0,0 +1,95 @@ +import os +import json +import shutil +import logging + +from os import makedirs +from os import path +from urllib.request import urlopen, Request +from urllib.error import HTTPError +from zipfile import ZipFile, is_zipfile + +from .base import ContentProvider +from ..utils import copytree, deep_get +from ..utils import normalize_doi, is_doi +from .. 
import __version__ + + +class DoiProvider(ContentProvider): + """Provide contents of a repository identified by a DOI and some helper functions.""" + + def urlopen(self, req, headers=None): + """A urlopen() helper""" + # someone passed a string, not a request + if not isinstance(req, Request): + req = Request(req) + + req.add_header("User-Agent", "repo2docker {}".format(__version__)) + if headers is not None: + for key, value in headers.items(): + req.add_header(key, value) + + return urlopen(req) + + def doi2url(self, doi): + # Transform a DOI to a URL + # If not a doi, assume we have a URL and return + if is_doi(doi): + doi = normalize_doi(doi) + + try: + resp = self.urlopen("https://doi.org/{}".format(doi)) + # If the DOI doesn't resolve, just return URL + except HTTPError: + return doi + return resp.url + else: + # Just return what is actually just a URL + return doi + + def fetch_file(self, file_ref, host, output_dir, unzip=False): + # the assumption is that `unzip=True` means that this is the only + # file related to a record + file_url = deep_get(file_ref, host["download"]) + fname = deep_get(file_ref, host["filename"]) + logging.debug("Downloading file {} as {}\n".format(file_url, fname)) + with self.urlopen(file_url) as src: + if path.dirname(fname): + sub_dir = path.join(output_dir, path.dirname(fname)) + if not path.exists(sub_dir): + yield "Creating {}\n".format(sub_dir) + makedirs(sub_dir, exist_ok=True) + + dst_fname = path.join(output_dir, fname) + with open(dst_fname, "wb") as dst: + yield "Fetching {}\n".format(fname) + shutil.copyfileobj(src, dst) + # first close the newly written file, then continue + # processing it + if unzip and is_zipfile(dst_fname): + yield "Extracting {}\n".format(fname) + zfile = ZipFile(dst_fname) + zfile.extractall(path=output_dir) + zfile.close() + + # delete downloaded file ... + os.remove(dst_fname) + # ... 
and any directories we might have created, + # in which case sub_dir will be defined + if path.dirname(fname): + shutil.rmtree(sub_dir) + + new_subdirs = os.listdir(output_dir) + # if there is only one new subdirectory move its contents + # to the top level directory + if len(new_subdirs) == 1: + d = new_subdirs[0] + copytree(path.join(output_dir, d), output_dir) + shutil.rmtree(path.join(output_dir, d)) + + yield "Fetched files: {}\n".format(os.listdir(output_dir)) + + @property + def content_id(self): + """The provider's ID for the record""" + return None diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py new file mode 100644 index 00000000..cb798625 --- /dev/null +++ b/repo2docker/contentproviders/figshare.py @@ -0,0 +1,85 @@ +import os +import re +import json +import shutil + +from os import makedirs +from os import path +from urllib.request import Request +from urllib.error import HTTPError +from zipfile import is_zipfile + +from .doi import DoiProvider +from ..utils import copytree, deep_get + + +class Figshare(DoiProvider): + """Provide contents of a Figshare article. + + See https://docs.figshare.com/#public_article for API docs. 
+ + Examples: + - https://doi.org/10.6084/m9.figshare.9782777 + - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI) + """ + + hosts = [ + { + "hostname": [ + "https://figshare.com/articles/", + "http://figshare.com/articles/", + "https://figshare.com/account/articles/", + ], + "api": "https://api.figshare.com/v2/articles/", + "filepath": "files", + "filename": "name", + "download": "download_url", + } + ] + + url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/\d)?") + + def detect(self, doi, ref=None, extra_args=None): + """Trigger this provider for things that resolve to a Figshare article""" + # We need the hostname (url where records are), api url (for metadata), + # filepath (path to files in metadata), filename (path to filename in + # metadata), and download (path to file download URL); Figshare hosts carry no `type` entry + + url = self.doi2url(doi) + + for host in self.hosts: + if any([url.startswith(s) for s in host["hostname"]]): + match = self.url_regex.match(url) + if match: + self.article_id = match.groups()[2] + return {"article": self.article_id, "host": host} + else: + return None + + def fetch(self, spec, output_dir, yield_output=False): + """Fetch and unpack a Figshare article""" + article_id = spec["article"] + host = spec["host"] + + yield "Fetching Figshare article {}.\n".format(article_id) + req = Request( + "{}{}".format(host["api"], article_id), + headers={"accept": "application/json"}, + ) + resp = self.urlopen(req) + + article = json.loads(resp.read().decode("utf-8")) + + files = deep_get(article, host["filepath"]) + # only fetch files where is_link_only: False + files = [file for file in files if not file["is_link_only"]] + only_one_file = len(files) == 1 + for file_ref in files: + unzip = file_ref["name"].endswith(".zip") and only_one_file + for line in self.fetch_file(file_ref, host, output_dir, unzip): + yield line + + @property + def content_id(self): + """The Figshare article ID""" + 
return self.article_id diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 426525ba..204acfc9 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -4,48 +4,16 @@ import shutil from os import makedirs from os import path -from urllib.request import urlopen, Request +from urllib.request import Request from urllib.error import HTTPError -from zipfile import ZipFile, is_zipfile -from .base import ContentProvider +from .doi import DoiProvider from ..utils import copytree, deep_get -from ..utils import normalize_doi, is_doi -from .. import __version__ -class Zenodo(ContentProvider): +class Zenodo(DoiProvider): """Provide contents of a Zenodo deposit.""" - def _urlopen(self, req, headers=None): - """A urlopen() helper""" - # someone passed a string, not a request - if not isinstance(req, Request): - req = Request(req) - - req.add_header("User-Agent", "repo2docker {}".format(__version__)) - if headers is not None: - for key, value in headers.items(): - req.add_header(key, value) - - return urlopen(req) - - def _doi2url(self, doi): - # Transform a DOI to a URL - # If not a doi, assume we have a URL and return - if is_doi(doi): - doi = normalize_doi(doi) - - try: - resp = self._urlopen("https://doi.org/{}".format(doi)) - # If the DOI doesn't resolve, just return URL - except HTTPError: - return doi - return resp.url - else: - # Just return what is actulally just a URL - return doi - def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" # We need the hostname (url where records are), api url (for metadata), @@ -73,7 +41,7 @@ class Zenodo(ContentProvider): }, ] - url = self._doi2url(doi) + url = self.doi2url(doi) for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): @@ -90,53 +58,17 @@ class Zenodo(ContentProvider): "{}{}".format(host["api"], record_id), headers={"accept": 
"application/json"}, ) - resp = self._urlopen(req) + resp = self.urlopen(req) record = json.loads(resp.read().decode("utf-8")) - def _fetch(file_ref, unzip=False): - # the assumption is that `unzip=True` means that this is the only - # file related to the zenodo record - with self._urlopen(deep_get(file_ref, host["download"])) as src: - fname = deep_get(file_ref, host["filename"]) - if path.dirname(fname): - sub_dir = path.join(output_dir, path.dirname(fname)) - if not path.exists(sub_dir): - yield "Creating {}\n".format(sub_dir) - makedirs(sub_dir, exist_ok=True) - - dst_fname = path.join(output_dir, fname) - with open(dst_fname, "wb") as dst: - yield "Fetching {}\n".format(fname) - shutil.copyfileobj(src, dst) - # first close the newly written file, then continue - # processing it - if unzip and is_zipfile(dst_fname): - yield "Extracting {}\n".format(fname) - zfile = ZipFile(dst_fname) - zfile.extractall(path=output_dir) - zfile.close() - - # delete downloaded file ... - os.remove(dst_fname) - # ... 
and any directories we might have created, - # in which case sub_dir will be defined - if path.dirname(fname): - shutil.rmtree(sub_dir) - - new_subdirs = os.listdir(output_dir) - # if there is only one new subdirectory move its contents - # to the top level directory - if len(new_subdirs) == 1: - d = new_subdirs[0] - copytree(path.join(output_dir, d), output_dir) - shutil.rmtree(path.join(output_dir, d)) - is_software = deep_get(record, host["type"]).lower() == "software" files = deep_get(record, host["filepath"]) only_one_file = len(files) == 1 for file_ref in files: - for line in _fetch(file_ref, unzip=is_software and only_one_file): + for line in self.fetch_file( + file_ref, host, output_dir, is_software and only_one_file + ): yield line @property diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py new file mode 100644 index 00000000..4b2b2132 --- /dev/null +++ b/tests/unit/contentproviders/test_figshare.py @@ -0,0 +1,148 @@ +import json +import os +import pytest + +from contextlib import contextmanager +from io import BytesIO +from tempfile import TemporaryDirectory, NamedTemporaryFile +from unittest.mock import patch +from urllib.request import urlopen, Request +from zipfile import ZipFile + +from repo2docker.contentproviders import Figshare + + +def test_content_id(): + with patch.object(Figshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = "https://figshare.com/articles/title/9782777" + fig = Figshare() + fig.detect("10.6084/m9.figshare.9782777") + assert fig.content_id == "9782777" + + +test_dois_links = [ + ("10.6084/m9.figshare.9782777", {"host": Figshare.hosts[0], "article": "9782777"}), + ( + "10.6084/m9.figshare.9782777.v1", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), + ( + "https://doi.org/10.6084/m9.figshare.9782777", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), + ( + "https://figshare.com/articles/title/97827771234", + {"host": Figshare.hosts[0], 
"article": "97827771234"}, + ), + ( + "https://figshare.com/articles/title/9782777/1", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), + ( + "https://figshare.com/articles/title/9782777/", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), +] + +test_spec = {"host": Figshare.hosts[0], "article": "1234"} + + +@pytest.mark.parametrize("test_input,expected", test_dois_links) +def test_detect_figshare(test_input, expected): + assert Figshare().detect(test_input) == expected + + +def test_detect_not_figshare(): + assert Figshare().detect("/some/path/here") is None + assert Figshare().detect("https://example.com/path/here") is None + assert Figshare().detect("10.21105/joss.01277") is None + assert Figshare().detect("10.5281/zenodo.3232985") is None + assert Figshare().detect("https://doi.org/10.21105/joss.01277") is None + + +@contextmanager +def figshare_archive(prefix="a_directory"): + with NamedTemporaryFile(suffix=".zip") as zfile: + with ZipFile(zfile.name, mode="w") as zip: + zip.writestr("{}/some-file.txt".format(prefix), "some content") + zip.writestr("{}/some-other-file.txt".format(prefix), "some more content") + + yield zfile.name + + +def test_fetch_zip(): + # see test_zenodo.py/test_fetch_software + with figshare_archive() as fig_path: + mock_response = BytesIO( + json.dumps( + { + "files": [ + { + "name": "afake.zip", + "is_link_only": False, + "download_url": "file://{}".format(fig_path), + } + ] + } + ).encode("utf-8") + ) + + def mock_urlopen(self, req): + if isinstance(req, Request): + return mock_response + else: + return urlopen(req) + + with patch.object(Figshare, "urlopen", new=mock_urlopen): + with TemporaryDirectory() as d: + fig = Figshare() + output = [] + for l in fig.fetch(test_spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + expected = set(["some-other-file.txt", "some-file.txt"]) + assert expected == unpacked_files + + +def test_fetch_data(): + with figshare_archive() as a_path: + with figshare_archive() 
as b_path: + mock_response = BytesIO( + json.dumps( + { + "files": [ + { + "name": "afake.file", + "download_url": "file://{}".format(a_path), + "is_link_only": False, + }, + { + "name": "bfake.data", + "download_url": "file://{}".format(b_path), + "is_link_only": False, + }, + {"name": "cfake.link", "is_link_only": True}, + ] + } + ).encode("utf-8") + ) + + def mock_urlopen(self, req): + if isinstance(req, Request): + return mock_response + else: + return urlopen(req) + + with patch.object(Figshare, "urlopen", new=mock_urlopen): + with TemporaryDirectory() as d: + fig = Figshare() + + output = [] + for l in fig.fetch(test_spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + # ZIP files shouldn't have been unpacked + expected = {"bfake.data", "afake.file"} + assert expected == unpacked_files diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index d9f66577..ea337ec8 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -13,7 +13,7 @@ from repo2docker.contentproviders import Zenodo def test_content_id(): - with patch.object(Zenodo, "_urlopen") as fake_urlopen: + with patch.object(Zenodo, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = "https://zenodo.org/record/3232985" zen = Zenodo() @@ -66,7 +66,7 @@ test_hosts = [ @pytest.mark.parametrize("test_input,expected", test_hosts) def test_detect_zenodo(test_input, expected): - with patch.object(Zenodo, "_urlopen") as fake_urlopen: + with patch.object(Zenodo, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = test_input[0] # valid Zenodo DOIs trigger this content provider assert Zenodo().detect(test_input[0]) == expected @@ -75,7 +75,7 @@ def test_detect_zenodo(test_input, expected): # only two of the three calls above have to resolve a DOI assert fake_urlopen.call_count == 2 - with patch.object(Zenodo, "_urlopen") as fake_urlopen: + with patch.object(Zenodo, "urlopen") as 
fake_urlopen: # Don't trigger the Zenodo content provider assert Zenodo().detect("/some/path/here") is None assert Zenodo().detect("https://example.com/path/here") is None @@ -120,7 +120,7 @@ def test_fetch_software_from_github_archive(): else: return urlopen(req) - with patch.object(Zenodo, "_urlopen", new=mock_urlopen): + with patch.object(Zenodo, "urlopen", new=mock_urlopen): zen = Zenodo() spec = { "host": { @@ -173,7 +173,7 @@ def test_fetch_software(): else: return urlopen(req) - with patch.object(Zenodo, "_urlopen", new=mock_urlopen): + with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() spec = spec = { @@ -227,7 +227,7 @@ def test_fetch_data(): else: return urlopen(req) - with patch.object(Zenodo, "_urlopen", new=mock_urlopen): + with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() spec = {