From e54c24ce2ee540805d6e4f74a355da995679fa3c Mon Sep 17 00:00:00 2001 From: David Douard Date: Tue, 24 Nov 2020 18:30:08 +0100 Subject: [PATCH] Add support for the SWHID content provider This content provider allows retrieving the content from a Software Heritage (SWH) persistent identifier (SWHID). Typical usage: repo2docker swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0 It uses the SWH public vault API to retrieve the content of the given directory. Most of the time, this will not need an authentication token to bypass the rate-limiting of the SWH API. Without authentication, one should be allowed to retrieve one directory content per minute. If this is not enough, then the user must use authenticated calls to the SWH API. For this, a new `swh_token` config item has been added to the Repo2Docker application class. To use authentication: repo2docker --config cfg.json swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0 with the swh_token config option being defined in the cfg.json config file. --- repo2docker/app.py | 17 +++ repo2docker/contentproviders/__init__.py | 1 + repo2docker/contentproviders/swhid.py | 113 ++++++++++++++++ setup.py | 1 + tests/unit/contentproviders/test_swhid.py | 157 ++++++++++++++++++++++ 5 files changed, 289 insertions(+) mode change 100644 => 100755 repo2docker/app.py create mode 100644 repo2docker/contentproviders/swhid.py create mode 100644 tests/unit/contentproviders/test_swhid.py diff --git a/repo2docker/app.py b/repo2docker/app.py old mode 100644 new mode 100755 index 1ffd2218..5b553712 --- a/repo2docker/app.py +++ b/repo2docker/app.py @@ -148,6 +148,7 @@ class Repo2Docker(Application): contentproviders.Figshare, contentproviders.Dataverse, contentproviders.Hydroshare, + contentproviders.Swhid, contentproviders.Mercurial, contentproviders.Git, ], @@ -269,6 +270,18 @@ class Repo2Docker(Application): allow_none=True, ) + swh_token = Unicode( None, help=""" Token to use authenticated SWH API access. 
+ + If unset, default to unauthenticated (limited) usage of the Software + Heritage API. + """, + config=True, + allow_none=True, + ) + cleanup_checkout = Bool( False, help=""" @@ -395,6 +408,10 @@ class Repo2Docker(Application): "No matching content provider found for " "{url}.".format(url=url) ) + swh_token = self.config.get("swh_token", self.swh_token) + if swh_token and isinstance(picked_content_provider, contentproviders.Swhid): + picked_content_provider.set_auth_token(swh_token) + for log_line in picked_content_provider.fetch( spec, checkout_path, yield_output=self.json_logs ): diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py index ae0b8c27..6398c233 100755 --- a/repo2docker/contentproviders/__init__.py +++ b/repo2docker/contentproviders/__init__.py @@ -5,3 +5,4 @@ from .figshare import Figshare from .dataverse import Dataverse from .hydroshare import Hydroshare from .mercurial import Mercurial +from .swhid import Swhid diff --git a/repo2docker/contentproviders/swhid.py b/repo2docker/contentproviders/swhid.py new file mode 100644 index 00000000..e2050177 --- /dev/null +++ b/repo2docker/contentproviders/swhid.py @@ -0,0 +1,113 @@ +import io +import os +import shutil +import tarfile +import time +import re + +from os import path + +import requests + +from .base import ContentProvider +from ..utils import copytree +from .. 
import io
import os
import re
import shutil
import tarfile
import time

from os import path

import requests

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    """Parse a SWHID core identifier into its components.

    Returns a dict with keys ``version``, ``type`` and ``hash`` when
    *swhid* is a syntactically valid core SWHID, None otherwise.
    """
    # Only parse/check the core part of the swhid; qualifiers after ";"
    # are ignored, see
    # https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    swhid_regexp = (
        r"^swh:(?P<version>\d+)"
        r":(?P<type>ori|cnt|rev|dir|snp|rel)"
        r":(?P<hash>[0-9a-f]{40})$"
    )
    m = re.match(swhid_regexp, swhid.split(";")[0])
    if m:
        return m.groupdict()


class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID."""

    # seconds to wait between vault cooking-status polls and retries
    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Attach a SWH bearer token so API calls are authenticated
        (authenticated calls get a higher rate limit)."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Perform an HTTP request, retrying up to 3 times.

        Retries on connection errors (sleeping ``retry_delay`` between
        attempts) and on non-2xx responses; returns the last response.

        Raises:
            requests.ConnectionError: if every attempt failed to connect.
        """
        if not url.endswith("/"):
            url = url + "/"

        resp = None
        for attempt in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                # On the last attempt, surface the error instead of
                # falling through with `resp` unbound.
                if attempt == 2:
                    raise
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval."""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a spec dict if *swhid* is a supported (v1 dir/rev) SWHID."""
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        """Cook the directory through the SWH vault API and extract the
        resulting tarball into *output_dir*, yielding progress messages."""
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        # POST starts the "cooking"; poll until the archive is ready.
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise RuntimeError(
                "SWH vault cooking of directory {} failed".format(dir_hash)
            )
        resp = self._request(resp.json()["fetch_url"])
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the content identified by *spec* (from ``detect``) into
        *output_dir*.  A revision SWHID is first resolved to its root
        directory; ``content_id`` is always the directory SWHID."""
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not depend on swh.model[cli] at runtime; it is only needed when
# regenerating the hard-coded directory hash used by gen_tarfile below.
def swhid_of_dir(path):
    # Local imports so that the (optional, heavy) swh.model package is
    # only required when this helper is actually called.
    from swh.model.from_disk import Directory
    from swh.model.identifiers import DIRECTORY, swhid

    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
    # A freshly constructed provider has not resolved anything yet.
    swhid = Swhid()
    assert swhid.content_id is None


swhids_ok = [
    "swh:1:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 40,
]
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,  # hash too short
    "swh:2:dir:" + "0" * 40,  # unsupported SWHID version
    "swh:1:rev:" + "0" * 41,  # hash too long
    "swh:1:cnt:" + "0" * 40,  # unsupported object types below
    "swh:1:ori:" + "0" * 40,
    "swh:1:rel:" + "0" * 40,
    "swh:1:snp:" + "0" * 40,
]

detect_values = [
    (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    provider = Swhid()
    assert provider.detect(swhid) == expected


def fake_urlopen(req):
    print(req)
    return req.headers


def test_unresolving_swhid():
    provider = Swhid()

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid


NULLID = "0" * 40


@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tarball whose single top-level directory is
    named after its (pre-computed) SWH directory hash."""
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # this directory hash can be computed using the swh.model package
    # (see swhid_of_dir above), but we do not want to depend on it here
    # to limit dependencies and because it does not support python 3.6;
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
    buf = io.BytesIO()
    tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w")
    tarf.add(rootdir, arcname=dirhash)
    tarf.close()
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider whose HTTP session is backed by a
    requests_mock adapter simulating the SWH vault API workflow."""
    provider = Swhid()
    adapter = requests_mock.Adapter()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)

    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            # NOTE(review): the author email was lost in transit; any
            # well-formed fullname works for the test.
            "author": {"fullname": "John Doe <jdoe@example.com>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri(
        "POST",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        json={
            "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
            "status": "new",
        },
    )
    # First poll reports "pending", second "done", exercising the wait loop.
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        [
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "pending",
                }
            },
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "done",
                }
            },
        ],
    )
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/raw/".format(dirhash),
        content=tarfile_buf,
    )
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:rev:" + NULLID
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    # A revision SWHID is resolved to its root directory SWHID.
    assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:dir:" + dir_id
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == swhid