From 0e4029bb8d2002f7321cea592f21561b8d7e9659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Sun, 8 Sep 2019 11:53:42 +0200 Subject: [PATCH 1/7] add Figshare content provider moving common functions to a DoiProvider --- docs/source/contributing/roadmap.md | 1 - docs/source/usage.rst | 2 +- repo2docker/app.py | 7 +- repo2docker/contentproviders/__init__.py | 1 + repo2docker/contentproviders/doi.py | 95 ++++++++++++ repo2docker/contentproviders/figshare.py | 85 +++++++++++ repo2docker/contentproviders/zenodo.py | 84 +---------- tests/unit/contentproviders/test_figshare.py | 148 +++++++++++++++++++ tests/unit/contentproviders/test_zenodo.py | 12 +- 9 files changed, 350 insertions(+), 85 deletions(-) create mode 100644 repo2docker/contentproviders/doi.py create mode 100644 repo2docker/contentproviders/figshare.py create mode 100644 tests/unit/contentproviders/test_figshare.py diff --git a/docs/source/contributing/roadmap.md b/docs/source/contributing/roadmap.md index c14ea387..24af47a9 100644 --- a/docs/source/contributing/roadmap.md +++ b/docs/source/contributing/roadmap.md @@ -83,4 +83,3 @@ time there is no active plan for an item. The project would like to find the resources and time to discuss and then execute these ideas. * support execution on a remote host (with more resources than available locally) via the command-line * add support for using ZIP files as the repo (`repo2docker https://example.com/an-archive.zip`) this will give us access to several archives (like Zenodo) that expose things as ZIP files. -* add support for Zenodo (`repo2docker 10.5281/zenodo.1476680`) so Zenodo software archives can be used as the source in addition to a git repository diff --git a/docs/source/usage.rst b/docs/source/usage.rst index b0fff1be..aae58cc7 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -12,7 +12,7 @@ Using ``repo2docker`` ``repo2docker`` can build a reproducible computational environment for any repository that follows :ref:`specification`. repo2docker is called with the URL of a Git repository, -a Zenodo DOI or a path to a local directory. It then +a DOI from Zenodo or Figshare, or a path to a local directory. It then performs these steps: 1. Inspects the repository for :ref:`configuration files `. These will be used to build diff --git a/repo2docker/app.py b/repo2docker/app.py index 755f0af1..ba0a0313 100644 --- a/repo2docker/app.py +++ b/repo2docker/app.py @@ -142,7 +142,12 @@ class Repo2Docker(Application): # detecting if something will successfully `git clone` is very hard if all # you can do is look at the path/URL to it. content_providers = List( - [contentproviders.Local, contentproviders.Zenodo, contentproviders.Git], + [ + contentproviders.Local, + contentproviders.Zenodo, + contentproviders.Figshare, + contentproviders.Git, + ], config=True, help=""" Ordered list by priority of ContentProviders to try in turn to fetch diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py index d648731d..f7f9369b 100644 --- a/repo2docker/contentproviders/__init__.py +++ b/repo2docker/contentproviders/__init__.py @@ -1,3 +1,4 @@ from .git import Git from .base import Local from .zenodo import Zenodo +from .figshare import Figshare diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py new file mode 100644 index 00000000..90c087aa --- /dev/null +++ b/repo2docker/contentproviders/doi.py @@ -0,0 +1,95 @@ +import os +import json +import shutil +import logging + +from os import makedirs +from os import path +from urllib.request import urlopen, Request +from urllib.error import HTTPError +from zipfile import ZipFile, is_zipfile + +from .base import ContentProvider +from ..utils import copytree, deep_get +from ..utils import normalize_doi, is_doi +from .. import __version__ + + +class DoiProvider(ContentProvider): + """Provide contents of a repository identified by a DOI and some helper functions.""" + + def urlopen(self, req, headers=None): + """A urlopen() helper""" + # someone passed a string, not a request + if not isinstance(req, Request): + req = Request(req) + + req.add_header("User-Agent", "repo2docker {}".format(__version__)) + if headers is not None: + for key, value in headers.items(): + req.add_header(key, value) + + return urlopen(req) + + def doi2url(self, doi): + # Transform a DOI to a URL + # If not a doi, assume we have a URL and return + if is_doi(doi): + doi = normalize_doi(doi) + + try: + resp = self.urlopen("https://doi.org/{}".format(doi)) + # If the DOI doesn't resolve, just return URL + except HTTPError: + return doi + return resp.url + else: + # Just return what is actulally just a URL + return doi + + def fetch_file(self, file_ref, host, output_dir, unzip=False): + # the assumption is that `unzip=True` means that this is the only + # file related to a record + file_url = deep_get(file_ref, host["download"]) + fname = deep_get(file_ref, host["filename"]) + logging.debug("Downloading file {} as {}\n".format(file_url, fname)) + with self.urlopen(file_url) as src: + if path.dirname(fname): + sub_dir = path.join(output_dir, path.dirname(fname)) + if not path.exists(sub_dir): + yield "Creating {}\n".format(sub_dir) + makedirs(sub_dir, exist_ok=True) + + dst_fname = path.join(output_dir, fname) + with open(dst_fname, "wb") as dst: + yield "Fetching {}\n".format(fname) + shutil.copyfileobj(src, dst) + # first close the newly written file, then continue + # processing it + if unzip and is_zipfile(dst_fname): + yield "Extracting {}\n".format(fname) + zfile = ZipFile(dst_fname) + zfile.extractall(path=output_dir) + zfile.close() + + # delete downloaded file ... + os.remove(dst_fname) + # ... and any directories we might have created, + # in which case sub_dir will be defined + if path.dirname(fname): + shutil.rmtree(sub_dir) + + new_subdirs = os.listdir(output_dir) + # if there is only one new subdirectory move its contents + # to the top level directory + if len(new_subdirs) == 1: + d = new_subdirs[0] + copytree(path.join(output_dir, d), output_dir) + shutil.rmtree(path.join(output_dir, d)) + + yield "Fetched files: {}\n".format(os.listdir(output_dir)) + + @property + def content_id(self): + """The provider's ID for the record""" + return None diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py new file mode 100644 index 00000000..cb798625 --- /dev/null +++ b/repo2docker/contentproviders/figshare.py @@ -0,0 +1,85 @@ +import os +import re +import json +import shutil + +from os import makedirs +from os import path +from urllib.request import Request +from urllib.error import HTTPError +from zipfile import is_zipfile + +from .doi import DoiProvider +from ..utils import copytree, deep_get + + +class Figshare(DoiProvider): + """Provide contents of a Figshare article. + + See https://docs.figshare.com/#public_article for API docs. + + Examples: + - https://doi.org/10.6084/m9.figshare.9782777 + - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI) + """ + + hosts = [ + { + "hostname": [ + "https://figshare.com/articles/", + "http://figshare.com/articles/", + "https://figshare.com/account/articles/", + ], + "api": "https://api.figshare.com/v2/articles/", + "filepath": "files", + "filename": "name", + "download": "download_url", + } + ] + + url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/\d)?") + + def detect(self, doi, ref=None, extra_args=None): + """Trigger this provider for things that resolve to a Figshare article""" + # We need the hostname (url where records are), api url (for metadata), + # filepath (path to files in metadata), filename (path to filename in + # metadata), download (path to file download URL), and type (path to item type in metadata) + + url = self.doi2url(doi) + + for host in self.hosts: + if any([url.startswith(s) for s in host["hostname"]]): + match = self.url_regex.match(url) + if match: + self.article_id = match.groups()[2] + return {"article": self.article_id, "host": host} + else: + return None + + def fetch(self, spec, output_dir, yield_output=False): + """Fetch and unpack a Figshare article""" + article_id = spec["article"] + host = spec["host"] + + yield "Fetching Figshare article {}.\n".format(article_id) + req = Request( + "{}{}".format(host["api"], article_id), + headers={"accept": "application/json"}, + ) + resp = self.urlopen(req) + + article = json.loads(resp.read().decode("utf-8")) + + files = deep_get(article, host["filepath"]) + # only fetch files where is_link_only: False + files = [file for file in files if not file["is_link_only"]] + only_one_file = len(files) == 1 + for file_ref in files: + unzip = file_ref["name"].endswith(".zip") and only_one_file + for line in self.fetch_file(file_ref, host, output_dir, unzip): + yield line + + @property + def content_id(self): + """The Figshare article ID""" + return self.article_id diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 426525ba..204acfc9 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -4,48 +4,16 @@ import shutil from os import makedirs from os import path -from urllib.request import urlopen, Request +from urllib.request import Request from urllib.error import HTTPError -from zipfile import ZipFile, is_zipfile -from .base import ContentProvider +from .doi import DoiProvider from ..utils import copytree, deep_get -from ..utils import normalize_doi, is_doi -from .. import __version__ -class Zenodo(ContentProvider): +class Zenodo(DoiProvider): """Provide contents of a Zenodo deposit.""" - def _urlopen(self, req, headers=None): - """A urlopen() helper""" - # someone passed a string, not a request - if not isinstance(req, Request): - req = Request(req) - - req.add_header("User-Agent", "repo2docker {}".format(__version__)) - if headers is not None: - for key, value in headers.items(): - req.add_header(key, value) - - return urlopen(req) - - def _doi2url(self, doi): - # Transform a DOI to a URL - # If not a doi, assume we have a URL and return - if is_doi(doi): - doi = normalize_doi(doi) - - try: - resp = self._urlopen("https://doi.org/{}".format(doi)) - # If the DOI doesn't resolve, just return URL - except HTTPError: - return doi - return resp.url - else: - # Just return what is actulally just a URL - return doi - def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" # We need the hostname (url where records are), api url (for metadata), @@ -73,7 +41,7 @@ class Zenodo(ContentProvider): }, ] - url = self._doi2url(doi) + url = self.doi2url(doi) for host in hosts: if any([url.startswith(s) for s in host["hostname"]]): @@ -90,53 +58,17 @@ class Zenodo(ContentProvider): "{}{}".format(host["api"], record_id), headers={"accept": "application/json"}, ) - resp = self._urlopen(req) + resp = self.urlopen(req) record = json.loads(resp.read().decode("utf-8")) - def _fetch(file_ref, unzip=False): - # the assumption is that `unzip=True` means that this is the only - # file related to the zenodo record - with self._urlopen(deep_get(file_ref, host["download"])) as src: - fname = deep_get(file_ref, host["filename"]) - if path.dirname(fname): - sub_dir = path.join(output_dir, path.dirname(fname)) - if not path.exists(sub_dir): - yield "Creating {}\n".format(sub_dir) - makedirs(sub_dir, exist_ok=True) - - dst_fname = path.join(output_dir, fname) - with open(dst_fname, "wb") as dst: - yield "Fetching {}\n".format(fname) - shutil.copyfileobj(src, dst) - # first close the newly written file, then continue - # processing it - if unzip and is_zipfile(dst_fname): - yield "Extracting {}\n".format(fname) - zfile = ZipFile(dst_fname) - zfile.extractall(path=output_dir) - zfile.close() - - # delete downloaded file ... - os.remove(dst_fname) - # ... and any directories we might have created, - # in which case sub_dir will be defined - if path.dirname(fname): - shutil.rmtree(sub_dir) - - new_subdirs = os.listdir(output_dir) - # if there is only one new subdirectory move its contents - # to the top level directory - if len(new_subdirs) == 1: - d = new_subdirs[0] - copytree(path.join(output_dir, d), output_dir) - shutil.rmtree(path.join(output_dir, d)) - is_software = deep_get(record, host["type"]).lower() == "software" files = deep_get(record, host["filepath"]) only_one_file = len(files) == 1 for file_ref in files: - for line in _fetch(file_ref, unzip=is_software and only_one_file): + for line in self.fetch_file( + file_ref, host, output_dir, is_software and only_one_file + ): yield line @property diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py new file mode 100644 index 00000000..4b2b2132 --- /dev/null +++ b/tests/unit/contentproviders/test_figshare.py @@ -0,0 +1,148 @@ +import json +import os +import pytest + +from contextlib import contextmanager +from io import BytesIO +from tempfile import TemporaryDirectory, NamedTemporaryFile +from unittest.mock import patch +from urllib.request import urlopen, Request +from zipfile import ZipFile + +from repo2docker.contentproviders import Figshare + + +def test_content_id(): + with patch.object(Figshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = "https://figshare.com/articles/title/9782777" + fig = Figshare() + fig.detect("10.6084/m9.figshare.9782777") + assert fig.content_id == "9782777" + + +test_dois_links = [ + ("10.6084/m9.figshare.9782777", {"host": Figshare.hosts[0], "article": "9782777"}), + ( + "10.6084/m9.figshare.9782777.v1", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), + ( + "https://doi.org/10.6084/m9.figshare.9782777", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), + ( + "https://figshare.com/articles/title/97827771234", + {"host": Figshare.hosts[0], "article": "97827771234"}, + ), + ( + "https://figshare.com/articles/title/9782777/1", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), + ( + "https://figshare.com/articles/title/9782777/", + {"host": Figshare.hosts[0], "article": "9782777"}, + ), +] + +test_spec = {"host": Figshare.hosts[0], "article": "1234"} + + +@pytest.mark.parametrize("test_input,expected", test_dois_links) +def test_detect_figshare(test_input, expected): + assert Figshare().detect(test_input) == expected + + +def test_detect_not_figshare(): + assert Figshare().detect("/some/path/here") is None + assert Figshare().detect("https://example.com/path/here") is None + assert Figshare().detect("10.21105/joss.01277") is None + assert Figshare().detect("10.5281/zenodo.3232985") is None + assert Figshare().detect("https://doi.org/10.21105/joss.01277") is None + + +@contextmanager +def figshare_archive(prefix="a_directory"): + with NamedTemporaryFile(suffix=".zip") as zfile: + with ZipFile(zfile.name, mode="w") as zip: + zip.writestr("{}/some-file.txt".format(prefix), "some content") + zip.writestr("{}/some-other-file.txt".format(prefix), "some more content") + + yield zfile.name + + +def test_fetch_zip(): + # see test_zenodo.py/test_fetch_software + with figshare_archive() as fig_path: + mock_response = BytesIO( + json.dumps( + { + "files": [ + { + "name": "afake.zip", + "is_link_only": False, + "download_url": "file://{}".format(fig_path), + } + ] + } + ).encode("utf-8") + ) + + def mock_urlopen(self, req): + if isinstance(req, Request): + return mock_response + else: + return urlopen(req) + + with patch.object(Figshare, "urlopen", new=mock_urlopen): + with TemporaryDirectory() as d: + fig = Figshare() + output = [] + for l in fig.fetch(test_spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + expected = set(["some-other-file.txt", "some-file.txt"]) + assert expected == unpacked_files + + +def test_fetch_data(): + with figshare_archive() as a_path: + with figshare_archive() as b_path: + mock_response = BytesIO( + json.dumps( + { + "files": [ + { + "name": "afake.file", + "download_url": "file://{}".format(a_path), + "is_link_only": False, + }, + { + "name": "bfake.data", + "download_url": "file://{}".format(b_path), + "is_link_only": False, + }, + {"name": "cfake.link", "is_link_only": True}, + ] + } + ).encode("utf-8") + ) + + def mock_urlopen(self, req): + if isinstance(req, Request): + return mock_response + else: + return urlopen(req) + + with patch.object(Figshare, "urlopen", new=mock_urlopen): + with TemporaryDirectory() as d: + fig = Figshare() + + output = [] + for l in fig.fetch(test_spec, d): + output.append(l) + + unpacked_files = set(os.listdir(d)) + # ZIP files shouldn't have been unpacked + expected = {"bfake.data", "afake.file"} + assert expected == unpacked_files diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index d9f66577..ea337ec8 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -13,7 +13,7 @@ from repo2docker.contentproviders import Zenodo def test_content_id(): - with patch.object(Zenodo, "_urlopen") as fake_urlopen: + with patch.object(Zenodo, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = "https://zenodo.org/record/3232985" zen = Zenodo() @@ -66,7 +66,7 @@ test_hosts = [ @pytest.mark.parametrize("test_input,expected", test_hosts) def test_detect_zenodo(test_input, expected): - with patch.object(Zenodo, "_urlopen") as fake_urlopen: + with patch.object(Zenodo, "urlopen") as fake_urlopen: fake_urlopen.return_value.url = test_input[0] # valid Zenodo DOIs trigger this content provider assert Zenodo().detect(test_input[0]) == expected @@ -75,7 +75,7 @@ def test_detect_zenodo(test_input, expected): # only two of the three calls above have to resolve a DOI assert fake_urlopen.call_count == 2 - with patch.object(Zenodo, "_urlopen") as fake_urlopen: + with patch.object(Zenodo, "urlopen") as fake_urlopen: # Don't trigger the Zenodo content provider assert Zenodo().detect("/some/path/here") is None assert Zenodo().detect("https://example.com/path/here") is None @@ -120,7 +120,7 @@ def test_fetch_software_from_github_archive(): else: return urlopen(req) - with patch.object(Zenodo, "_urlopen", new=mock_urlopen): + with patch.object(Zenodo, "urlopen", new=mock_urlopen): zen = Zenodo() spec = { "host": { @@ -173,7 +173,7 @@ def test_fetch_software(): else: return urlopen(req) - with patch.object(Zenodo, "_urlopen", new=mock_urlopen): + with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() spec = spec = { @@ -227,7 +227,7 @@ def test_fetch_data(): else: return urlopen(req) - with patch.object(Zenodo, "_urlopen", new=mock_urlopen): + with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() spec = { From e6cd4b19d97ba5798812ee848184e962640bae50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Sun, 8 Sep 2019 11:54:06 +0200 Subject: [PATCH 2/7] reuse hosts definition in Zenodo tests --- repo2docker/contentproviders/zenodo.py | 52 ++++++++-------- tests/unit/contentproviders/test_zenodo.py | 72 ++-------------------- 2 files changed, 31 insertions(+), 93 deletions(-) diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 204acfc9..4712c755 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -14,36 +14,36 @@ from ..utils import copytree, deep_get class Zenodo(DoiProvider): """Provide contents of a Zenodo deposit.""" + # We need the hostname (url where records are), api url (for metadata), + # filepath (path to files in metadata), filename (path to filename in + # metadata), download (path to file download URL), and type (path to item type in metadata) + hosts = [ + { + "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], + "api": "https://zenodo.org/api/records/", + "filepath": "files", + "filename": "filename", + "download": "links.download", + "type": "metadata.upload_type", + }, + { + "hostname": [ + "https://data.caltech.edu/records/", + "http://data.caltech.edu/records/", + ], + "api": "https://data.caltech.edu/api/record/", + "filepath": "metadata.electronic_location_and_access", + "filename": "electronic_name.0", + "download": "uniform_resource_identifier", + "type": "metadata.resourceType.resourceTypeGeneral", + }, + ] + def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" - # We need the hostname (url where records are), api url (for metadata), - # filepath (path to files in metadata), filename (path to filename in - # metadata), download (path to file download URL), and type (path to item type in metadata) - hosts = [ - { - "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], - "api": "https://zenodo.org/api/records/", - "filepath": "files", - "filename": "filename", - "download": "links.download", - "type": "metadata.upload_type", - }, - { - "hostname": [ - "https://data.caltech.edu/records/", - "http://data.caltech.edu/records/", - ], - "api": "https://data.caltech.edu/api/record/", - "filepath": "metadata.electronic_location_and_access", - "filename": "electronic_name.0", - "download": "uniform_resource_identifier", - "type": "metadata.resourceType.resourceTypeGeneral", - }, - ] - url = self.doi2url(doi) - for host in hosts: + for host in self.hosts: if any([url.startswith(s) for s in host["hostname"]]): self.record_id = url.rsplit("/", maxsplit=1)[1] return {"record": self.record_id, "host": host} diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index ea337ec8..0b401e04 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -28,17 +28,7 @@ test_hosts = [ "10.5281/zenodo.3232985", "https://doi.org/10.5281/zenodo.3232985", ], - { - "host": { - "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], - "api": "https://zenodo.org/api/records/", - "filepath": "files", - "filename": "filename", - "download": "links.download", - "type": "metadata.upload_type", - }, - "record": "3232985", - }, + {"host": Zenodo.hosts[0], "record": "3232985"}, ), ( [ @@ -46,20 +36,7 @@ test_hosts = [ "10.22002/d1.1235", "https://doi.org/10.22002/d1.1235", ], - { - "host": { - "hostname": [ - "https://data.caltech.edu/records/", - "http://data.caltech.edu/records/", - ], - "api": "https://data.caltech.edu/api/record/", - "filepath": "metadata.electronic_location_and_access", - "filename": "electronic_name.0", - "download": "uniform_resource_identifier", - "type": "metadata.resourceType.resourceTypeGeneral", - }, - "record": "1235", - }, + {"host": Zenodo.hosts[1], "record": "1235"}, ), ] @@ -122,20 +99,7 @@ def test_fetch_software_from_github_archive(): with patch.object(Zenodo, "urlopen", new=mock_urlopen): zen = Zenodo() - spec = { - "host": { - "hostname": [ - "https://zenodo.org/record/", - "http://zenodo.org/record/", - ], - "api": "https://zenodo.org/api/records/", - "filepath": "files", - "filename": "filename", - "download": "links.download", - "type": "metadata.upload_type", - }, - "record": "1234", - } + spec = {"host": Zenodo.hosts[0], "record": "1234"} with TemporaryDirectory() as d: output = [] @@ -176,20 +140,7 @@ def test_fetch_software(): with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - spec = spec = { - "host": { - "hostname": [ - "https://zenodo.org/record/", - "http://zenodo.org/record/", - ], - "api": "https://zenodo.org/api/records/", - "filepath": "files", - "filename": "filename", - "download": "links.download", - "type": "metadata.upload_type", - }, - "record": "1234", - } + spec = spec = {"host": Zenodo.hosts[0], "record": "1234"} output = [] for l in zen.fetch(spec, d): output.append(l) @@ -230,20 +181,7 @@ def test_fetch_data(): with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - spec = { - "host": { - "hostname": [ - "https://zenodo.org/record/", - "http://zenodo.org/record/", - ], - "api": "https://zenodo.org/api/records/", - "filepath": "files", - "filename": "filename", - "download": "links.download", - "type": "metadata.upload_type", - }, - "record": "1234", - } + spec = {"host": Zenodo.hosts[0], "record": "1234"} output = [] for l in zen.fetch(spec, d): output.append(l) From b2b603049227c57849dafd09555b25862654934a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Sun, 8 Sep 2019 21:39:07 +0200 Subject: [PATCH 3/7] initialize hosts as instance variable --- repo2docker/contentproviders/figshare.py | 27 +++++------ repo2docker/contentproviders/zenodo.py | 49 ++++++++++---------- tests/unit/contentproviders/test_figshare.py | 15 +++--- tests/unit/contentproviders/test_zenodo.py | 11 +++-- 4 files changed, 53 insertions(+), 49 deletions(-) diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index cb798625..dac23d1f 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -23,19 +23,20 @@ class Figshare(DoiProvider): - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI) """ - hosts = [ - { - "hostname": [ - "https://figshare.com/articles/", - "http://figshare.com/articles/", - "https://figshare.com/account/articles/", - ], - "api": "https://api.figshare.com/v2/articles/", - "filepath": "files", - "filename": "name", - "download": "download_url", - } - ] + def __init__(self): + self.hosts = [ + { + "hostname": [ + "https://figshare.com/articles/", + "http://figshare.com/articles/", + "https://figshare.com/account/articles/", + ], + "api": "https://api.figshare.com/v2/articles/", + "filepath": "files", + "filename": "name", + "download": "download_url", + } + ] url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/\d)?") diff --git a/repo2docker/contentproviders/zenodo.py b/repo2docker/contentproviders/zenodo.py index 4712c755..87e7967f 100644 --- a/repo2docker/contentproviders/zenodo.py +++ b/repo2docker/contentproviders/zenodo.py @@ -14,30 +14,31 @@ from ..utils import copytree, deep_get class Zenodo(DoiProvider): """Provide contents of a Zenodo deposit.""" - # We need the hostname (url where records are), api url (for metadata), - # filepath (path to files in metadata), filename (path to filename in - # metadata), download (path to file download URL), and type (path to item type in metadata) - hosts = [ - { - "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], - "api": "https://zenodo.org/api/records/", - "filepath": "files", - "filename": "filename", - "download": "links.download", - "type": "metadata.upload_type", - }, - { - "hostname": [ - "https://data.caltech.edu/records/", - "http://data.caltech.edu/records/", - ], - "api": "https://data.caltech.edu/api/record/", - "filepath": "metadata.electronic_location_and_access", - "filename": "electronic_name.0", - "download": "uniform_resource_identifier", - "type": "metadata.resourceType.resourceTypeGeneral", - }, - ] + def __init__(self): + # We need the hostname (url where records are), api url (for metadata), + # filepath (path to files in metadata), filename (path to filename in + # metadata), download (path to file download URL), and type (path to item type in metadata) + self.hosts = [ + { + "hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"], + "api": "https://zenodo.org/api/records/", + "filepath": "files", + "filename": "filename", + "download": "links.download", + "type": "metadata.upload_type", + }, + { + "hostname": [ + "https://data.caltech.edu/records/", + "http://data.caltech.edu/records/", + ], + "api": "https://data.caltech.edu/api/record/", + "filepath": "metadata.electronic_location_and_access", + "filename": "electronic_name.0", + "download": "uniform_resource_identifier", + "type": "metadata.resourceType.resourceTypeGeneral", + }, + ] def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Zenodo/Invenio record""" diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 4b2b2132..58564a5f 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -20,31 +20,32 @@ def test_content_id(): assert fig.content_id == "9782777" +test_fig = Figshare() test_dois_links = [ - ("10.6084/m9.figshare.9782777", {"host": Figshare.hosts[0], "article": "9782777"}), + ("10.6084/m9.figshare.9782777", {"host": test_fig.hosts[0], "article": "9782777"}), ( "10.6084/m9.figshare.9782777.v1", - {"host": Figshare.hosts[0], "article": "9782777"}, + {"host": test_fig.hosts[0], "article": "9782777"}, ), ( "https://doi.org/10.6084/m9.figshare.9782777", - {"host": Figshare.hosts[0], "article": "9782777"}, + {"host": test_fig.hosts[0], "article": "9782777"}, ), ( "https://figshare.com/articles/title/97827771234", - {"host": Figshare.hosts[0], "article": "97827771234"}, + {"host": test_fig.hosts[0], "article": "97827771234"}, ), ( "https://figshare.com/articles/title/9782777/1", - {"host": Figshare.hosts[0], "article": "9782777"}, + {"host": test_fig.hosts[0], "article": "9782777"}, ), ( "https://figshare.com/articles/title/9782777/", - {"host": Figshare.hosts[0], "article": "9782777"}, + {"host": test_fig.hosts[0], "article": "9782777"}, ), ] -test_spec = {"host": Figshare.hosts[0], "article": "1234"} +test_spec = {"host": test_fig.hosts[0], "article": "1234"} @pytest.mark.parametrize("test_input,expected", test_dois_links) diff --git a/tests/unit/contentproviders/test_zenodo.py b/tests/unit/contentproviders/test_zenodo.py index 0b401e04..61dcbfbf 100644 --- a/tests/unit/contentproviders/test_zenodo.py +++ b/tests/unit/contentproviders/test_zenodo.py @@ -21,6 +21,7 @@ def test_content_id(): assert zen.content_id == "3232985" +test_zen = Zenodo() test_hosts = [ ( [ @@ -28,7 +29,7 @@ test_hosts = [ "10.5281/zenodo.3232985", "https://doi.org/10.5281/zenodo.3232985", ], - {"host": Zenodo.hosts[0], "record": "3232985"}, + {"host": test_zen.hosts[0], "record": "3232985"}, ), ( [ @@ -36,7 +37,7 @@ test_hosts = [ "10.22002/d1.1235", "https://doi.org/10.22002/d1.1235", ], - {"host": Zenodo.hosts[1], "record": "1235"}, + {"host": test_zen.hosts[1], "record": "1235"}, ), ] @@ -99,7 +100,7 @@ def test_fetch_software_from_github_archive(): with patch.object(Zenodo, "urlopen", new=mock_urlopen): zen = Zenodo() - spec = {"host": Zenodo.hosts[0], "record": "1234"} + spec = {"host": test_zen.hosts[0], "record": "1234"} with TemporaryDirectory() as d: output = [] @@ -140,7 +141,7 @@ def test_fetch_software(): with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - spec = spec = {"host": Zenodo.hosts[0], "record": "1234"} + spec = spec = {"host": test_zen.hosts[0], "record": "1234"} output = [] for l in zen.fetch(spec, d): output.append(l) @@ -181,7 +182,7 @@ def test_fetch_data(): with patch.object(Zenodo, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: zen = Zenodo() - spec = {"host": Zenodo.hosts[0], "record": "1234"} + spec = {"host": test_zen.hosts[0], "record": "1234"} output = [] for l in zen.fetch(spec, d): output.append(l) From b2b75600c0fbb902795769ead79c4c042fbed15d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Mon, 9 Sep 2019 08:37:02 +0200 Subject: [PATCH 4/7] handle versions in Figshare properly --- repo2docker/contentproviders/figshare.py | 25 +++++++++-- tests/unit/contentproviders/test_figshare.py | 45 ++++++++++++++------ 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index dac23d1f..f481f945 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -20,6 +20,7 @@ class Figshare(DoiProvider): Examples: - https://doi.org/10.6084/m9.figshare.9782777 + - https://doi.org/10.6084/m9.figshare.9782777.v2 - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI) """ @@ -38,7 +39,7 @@ class Figshare(DoiProvider): } ] - url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/\d)?") + url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d)?") def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Figshare article""" @@ -53,7 +54,14 @@ class Figshare(DoiProvider): match = self.url_regex.match(url) if match: self.article_id = match.groups()[2] - return {"article": self.article_id, "host": host} + self.article_version = match.groups()[4] + if not self.article_version: + self.article_version = "1" + return { + "article": self.article_id, + "host": host, + "version": self.article_version, + } else: return None @@ -62,9 +70,13 @@ class Figshare(DoiProvider): article_id = spec["article"] host = spec["host"] - yield "Fetching Figshare article {}.\n".format(article_id) + yield "Fetching Figshare article {} in version {}.\n".format( + self.article_id, self.article_version + ) req = Request( - "{}{}".format(host["api"], article_id), + "{}{}/versions/{}".format( + host["api"], self.article_id, self.article_version + ), headers={"accept": "application/json"}, ) resp = self.urlopen(req) @@ -84,3 +96,8 @@ class Figshare(DoiProvider): def content_id(self): """The Figshare article ID""" return self.article_id + + @property + def content_version(self): + """The Figshare article ID""" + return self.article_version diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 58564a5f..3f40d37b 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -1,5 +1,6 @@ import json import os +import re import pytest from contextlib import contextmanager @@ -10,6 +11,7 @@ from urllib.request import urlopen, Request from zipfile import ZipFile from repo2docker.contentproviders import Figshare +from repo2docker.__main__ import make_r2d def test_content_id(): @@ -21,31 +23,49 @@ def test_content_id(): test_fig = Figshare() +test_fig.article_id = "123456" +test_fig.article_version = "42" + test_dois_links = [ - ("10.6084/m9.figshare.9782777", {"host": test_fig.hosts[0], "article": "9782777"}), ( - "10.6084/m9.figshare.9782777.v1", - {"host": test_fig.hosts[0], "article": "9782777"}, + "10.6084/m9.figshare.9782777", + {"host": test_fig.hosts[0], "article": "9782777", "version": "1"}, ), ( - "https://doi.org/10.6084/m9.figshare.9782777", - {"host": test_fig.hosts[0], "article": "9782777"}, + "10.6084/m9.figshare.9782777.v1", + {"host": test_fig.hosts[0], "article": "9782777", "version": "1"}, + ), + ( + "10.6084/m9.figshare.9782777.v2", + {"host": test_fig.hosts[0], "article": "9782777", "version": "2"}, + ), + ( + "https://doi.org/10.6084/m9.figshare.9782777.v1", + {"host": test_fig.hosts[0], "article": "9782777", "version": "1"}, + ), + ( + "https://doi.org/10.6084/m9.figshare.9782777.v3", + {"host": test_fig.hosts[0], "article": "9782777", "version": "3"}, ), ( "https://figshare.com/articles/title/97827771234", - {"host": test_fig.hosts[0], "article": "97827771234"}, + {"host": test_fig.hosts[0], "article": "97827771234", "version": "1"}, ), ( "https://figshare.com/articles/title/9782777/1", - {"host": test_fig.hosts[0], "article": "9782777"}, + {"host": test_fig.hosts[0], "article": "9782777", "version": "1"}, + ), + ( + "https://figshare.com/articles/title/9782777/2", + {"host": test_fig.hosts[0], "article": "9782777", "version": "2"}, ), ( "https://figshare.com/articles/title/9782777/", - {"host": test_fig.hosts[0], "article": "9782777"}, + {"host": test_fig.hosts[0], "article": "9782777", "version": "1"}, ), ] -test_spec = {"host": test_fig.hosts[0], "article": "1234"} +test_spec = {"host": test_fig.hosts[0], "article": "1234", "version": "42"} @pytest.mark.parametrize("test_input,expected", test_dois_links) @@ -96,9 +116,8 @@ def test_fetch_zip(): with patch.object(Figshare, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: - fig = Figshare() output = [] - for l in fig.fetch(test_spec, d): + for l in test_fig.fetch(test_spec, d): output.append(l) unpacked_files = set(os.listdir(d)) @@ -137,10 +156,8 @@ def test_fetch_data(): with patch.object(Figshare, "urlopen", new=mock_urlopen): with TemporaryDirectory() as d: - fig = Figshare() - output = [] - for l in fig.fetch(test_spec, d): + for l in test_fig.fetch(test_spec, d): output.append(l) unpacked_files = set(os.listdir(d)) From 698b8a1d7eb4e631aa75270adc2082fd64630640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Wed, 11 Sep 2019 13:17:48 +0200 Subject: [PATCH 5/7] try to improve test coverage --- repo2docker/contentproviders/doi.py | 13 ++++--- repo2docker/contentproviders/figshare.py | 7 ++-- tests/unit/contentproviders/test_doi.py | 41 ++++++++++++++++++++ tests/unit/contentproviders/test_figshare.py | 6 ++- 4 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 tests/unit/contentproviders/test_doi.py diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py index 90c087aa..83ef79a7 100644 --- a/repo2docker/contentproviders/doi.py +++ b/repo2docker/contentproviders/doi.py @@ -1,11 +1,12 @@ import os +import abc import json import shutil import logging from os import makedirs from os import path -from urllib.request import urlopen, Request +from urllib import request # urlopen, Request from urllib.error import HTTPError from zipfile import ZipFile, is_zipfile @@ -21,15 +22,15 @@ class DoiProvider(ContentProvider): def urlopen(self, req, headers=None): """A urlopen() helper""" # someone passed a string, not a request - if not isinstance(req, Request): - req = Request(req) + if not isinstance(req, request.Request): + req = request.Request(req) req.add_header("User-Agent", "repo2docker {}".format(__version__)) if headers is not None: for key, value in headers.items(): req.add_header(key, value) - return urlopen(req) + return request.urlopen(req) def doi2url(self, doi): # Transform a DOI to a URL @@ -90,6 +91,6 @@ class DoiProvider(ContentProvider): yield "Fetched files: {}\n".format(os.listdir(output_dir)) @property + @abc.abstractmethod def content_id(self): - """The provider's ID for the record""" - return None + pass diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index f481f945..7cb6bcd9 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -68,15 +68,14 @@ class Figshare(DoiProvider): def fetch(self, spec, output_dir, yield_output=False): """Fetch and unpack a Figshare article""" article_id = spec["article"] + article_version = spec["version"] host = spec["host"] yield "Fetching Figshare article {} in version {}.\n".format( - self.article_id, self.article_version + article_id, article_version ) req = Request( - "{}{}/versions/{}".format( - host["api"], self.article_id, self.article_version - ), + "{}{}/versions/{}".format(host["api"], article_id, article_version), headers={"accept": "application/json"}, ) resp = self.urlopen(req) diff --git a/tests/unit/contentproviders/test_doi.py b/tests/unit/contentproviders/test_doi.py new file mode 100644 index 00000000..16d321e6 --- /dev/null +++ b/tests/unit/contentproviders/test_doi.py @@ -0,0 +1,41 @@ +import json +import os +import re +import urllib +import pytest +import tempfile +import logging + +from unittest.mock import patch, MagicMock, mock_open +from zipfile import ZipFile + +from repo2docker.contentproviders.doi import DoiProvider +from repo2docker.contentproviders.base import ContentProviderException + + +def test_content_id(): + doi = DoiProvider() + assert doi.content_id is None + + +def fake_urlopen(req): + print(req) + return req.headers + + +@patch("urllib.request.urlopen", fake_urlopen) +def test_url_headers(): + doi = DoiProvider() + + headers = {"test1": "value1", "Test2": "value2"} + result = doi.urlopen("https://mybinder.org", headers=headers) + assert "Test1" in result + assert "Test2" in result + assert len(result) is 3 # User-agent is also set + + +def test_unresolving_doi(): + doi = DoiProvider() + + fakedoi = "10.1/1234" + assert doi.doi2url(fakedoi) is fakedoi diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 3f40d37b..2b1cdf7e 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -63,9 +63,13 @@ test_dois_links = [ "https://figshare.com/articles/title/9782777/", {"host": test_fig.hosts[0], "article": "9782777", "version": "1"}, ), + ( + "https://figshare.com/articles/title/9782777/1234", + {"host": test_fig.hosts[0], "article": "9782777", "version": "1234"}, + ), ] -test_spec = {"host": test_fig.hosts[0], "article": "1234", "version": "42"} +test_spec = {"host": test_fig.hosts[0], "article": "123456", "version": "42"} @pytest.mark.parametrize("test_input,expected", test_dois_links) From f1ec646d2e0944eefb3ef0637019efdf8bb6720e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Wed, 11 Sep 2019 13:18:20 +0200 Subject: [PATCH 6/7] add support for Figshare versions > 9 --- repo2docker/contentproviders/figshare.py | 2 +- tests/unit/contentproviders/test_figshare.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index 7cb6bcd9..1cec5ef2 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -39,7 +39,7 @@ class Figshare(DoiProvider): } ] - url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d)?") + url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d+)?") def detect(self, doi, ref=None, extra_args=None): """Trigger this provider for things that resolve to a Figshare article""" diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 2b1cdf7e..2d15a825 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -21,6 +21,17 @@ def test_content_id(): fig.detect("10.6084/m9.figshare.9782777") assert fig.content_id == "9782777" + fig.detect("10.6084/m9.figshare.9782777.v123") + assert fig.content_id == "9782777" + + +def test_content_version(): + with patch.object(Figshare, "urlopen") as fake_urlopen: + fake_urlopen.return_value.url = "https://figshare.com/articles/title/9782777/2" + fig = Figshare() + fig.detect("10.6084/m9.figshare.9782777.v2") + assert fig.content_version == "2" + test_fig = Figshare() test_fig.article_id = "123456" From a40e1799bdb900c790165c8e7501f7fb00f380f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20N=C3=BCst?= Date: Wed, 11 Sep 2019 13:45:16 +0200 Subject: [PATCH 7/7] fix content_id for Figshare --- repo2docker/contentproviders/doi.py | 6 ----- repo2docker/contentproviders/figshare.py | 7 +----- tests/unit/contentproviders/test_figshare.py | 25 +++++++++----------- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py index 83ef79a7..6ec2c8e1 100644 --- a/repo2docker/contentproviders/doi.py +++ b/repo2docker/contentproviders/doi.py @@ -1,5 +1,4 @@ import os -import abc import json import shutil import logging @@ -89,8 +88,3 @@ class DoiProvider(ContentProvider): shutil.rmtree(path.join(output_dir, d)) yield "Fetched files: {}\n".format(os.listdir(output_dir)) - - @property - @abc.abstractmethod - def content_id(self): - pass diff --git a/repo2docker/contentproviders/figshare.py b/repo2docker/contentproviders/figshare.py index 1cec5ef2..36aa8067 100644 --- a/repo2docker/contentproviders/figshare.py +++ b/repo2docker/contentproviders/figshare.py @@ -94,9 +94,4 @@ class Figshare(DoiProvider): @property def content_id(self): """The Figshare article ID""" - return self.article_id - - @property - def content_version(self): - """The Figshare article ID""" - return self.article_version + return "{}.v{}".format(self.article_id, self.article_version) diff --git a/tests/unit/contentproviders/test_figshare.py b/tests/unit/contentproviders/test_figshare.py index 2d15a825..0e152ecf 100644 --- a/tests/unit/contentproviders/test_figshare.py +++ b/tests/unit/contentproviders/test_figshare.py @@ -14,23 +14,20 @@ from repo2docker.contentproviders import Figshare from repo2docker.__main__ import make_r2d -def test_content_id(): +test_content_ids = [ + ("https://figshare.com/articles/title/9782777", "9782777.v1"), + ("https://figshare.com/articles/title/9782777/2", "9782777.v2"), + ("https://figshare.com/articles/title/9782777/1234", "9782777.v1234"), +] + + +@pytest.mark.parametrize("link,expected", test_content_ids) +def test_content_id(link, expected): with patch.object(Figshare, "urlopen") as fake_urlopen: - fake_urlopen.return_value.url = "https://figshare.com/articles/title/9782777" + fake_urlopen.return_value.url = link fig = Figshare() fig.detect("10.6084/m9.figshare.9782777") - assert fig.content_id == "9782777" - - fig.detect("10.6084/m9.figshare.9782777.v123") - assert fig.content_id == "9782777" - - -def test_content_version(): - with patch.object(Figshare, "urlopen") as fake_urlopen: - fake_urlopen.return_value.url = "https://figshare.com/articles/title/9782777/2" - fig = Figshare() - fig.detect("10.6084/m9.figshare.9782777.v2") - assert fig.content_version == "2" + assert fig.content_id == expected test_fig = Figshare()