Add support for the SWHID content provider

This content provider allows to retrieve the content from a
Software Heritage (SWH) persistent identifier (SWHID).
Typical usage:

  repo2docker swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0

It uses the SWH public vault API to retrieve the content of the given
directory.

Most of the time, this will not need an authentication
token to bypass the rate-limiting of the SWH API.
Without authentication, one should be allowed to retrieve one
directory content per minute.

If this is not enough, then the user must use authenticated calls to
the SWH API.

For this, a new `swh_token` config item has been added to the Repo2Docker
application class.

To use authentication:

  repo2docker --config cfg.json swh:1:rev:94dca98c006b80309704c717b5d83dff3c1fa3a0

with the swh_token config option being defined in the cfg.json config file.
pull/988/head
David Douard 2020-11-24 18:30:08 +01:00
rodzic 983607fed7
commit e54c24ce2e
5 zmienionych plików z 289 dodań i 0 usunięć

17
repo2docker/app.py 100644 → 100755
Wyświetl plik

@ -148,6 +148,7 @@ class Repo2Docker(Application):
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Hydroshare,
contentproviders.Swhid,
contentproviders.Mercurial,
contentproviders.Git,
],
@ -269,6 +270,18 @@ class Repo2Docker(Application):
allow_none=True,
)
swh_token = Unicode(
None,
help="""
Token to use authenticated SWH API access.
If unset, default to unauthenticated (limited) usage of the Software
Heritage API.
""",
config=True,
allow_none=True,
)
cleanup_checkout = Bool(
False,
help="""
@ -395,6 +408,10 @@ class Repo2Docker(Application):
"No matching content provider found for " "{url}.".format(url=url)
)
swh_token = self.config.get("swh_token", self.swh_token)
if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
picked_content_provider.set_auth_token(swh_token)
for log_line in picked_content_provider.fetch(
spec, checkout_path, yield_output=self.json_logs
):

Wyświetl plik

@ -5,3 +5,4 @@ from .figshare import Figshare
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
from .swhid import Swhid

Wyświetl plik

@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re
from os import path
import requests
from .base import ContentProvider
from ..utils import copytree
from .. import __version__
def parse_swhid(swhid):
    """Split a SWHID core identifier into its named components.

    Only the <identifier_core> part (anything before a ';' qualifier) is
    examined; see
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

    Returns a dict with keys "version", "type" and "hash", or None when
    the string does not look like a valid SWHID.
    """
    pattern = (
        r"^swh:(?P<version>\d+)"
        r":(?P<type>ori|cnt|rev|dir|snp|rel)"
        r":(?P<hash>[0-9a-f]{40})$"
    )
    core = swhid.split(";")[0]
    matched = re.match(pattern, core)
    return matched.groupdict() if matched else None
class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID.

    Uses the Software Heritage (SWH) vault API to cook and download the
    tarball of a directory identified either directly (``swh:1:dir:...``)
    or via a revision (``swh:1:rev:...``).
    """

    # Seconds between vault cooking-status polls and between HTTP retries.
    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"

        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Send `token` as a bearer token for authenticated SWH API calls."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Issue an HTTP request against the SWH API, retrying up to 3 times.

        A trailing slash is appended to `url` if missing (the SWH API
        requires it). Connection errors trigger a retry after
        `retry_delay` seconds; the last connection error is re-raised if
        every attempt fails (previously `resp` could be referenced
        before assignment in that case). Non-OK responses are retried
        too, and the final response is returned either way — callers are
        expected to check ``resp.ok``.
        """
        if not url.endswith("/"):
            url = url + "/"

        resp = None
        for attempt in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                # Fix: on the final attempt, re-raise instead of falling
                # through with `resp` possibly unbound.
                if attempt == 2:
                    raise
                time.sleep(self.retry_delay)
        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a spec dict if `swhid` is a v1 dir/rev SWHID, else None.

        Only version-1 identifiers of type ``dir`` or ``rev`` are
        handled; other types (cnt, ori, rel, snp) are rejected.
        """
        swhid_dict = parse_swhid(swhid)
        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        """Cook directory `dir_hash` via the vault API and extract it.

        Yields progress messages; raises Exception if the vault cooking
        fails. The tarball content ends up directly in `output_dir`.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        # Poll until the vault has finished cooking the tarball. An
        # immediately-"failed" receipt skips the loop and hits the error
        # path below (this replaces the original `assert`, which would
        # be stripped under `python -O`).
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            # Fix: carry a message instead of a bare Exception().
            raise Exception(
                "SWH vault failed to prepare directory {}".format(dir_hash)
            )
        resp = self._request(resp.json()["fetch_url"])
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # The tarball contains a single top-level directory named after
        # dir_hash; move its content one level up.
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the directory identified by the SWHID in `spec`.

        For a revision SWHID the revision is first resolved to its root
        directory through the SWH API; `content_id` is then the
        directory SWHID actually fetched.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            # Fix: explicit check instead of `assert` (stripped under -O).
            if not resp.ok:
                raise Exception(
                    "Failed to fetch revision {}: {}".format(sha1git, resp.content)
                )
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)

Wyświetl plik

@ -56,6 +56,7 @@ setup(
"ruamel.yaml>=0.15",
"toml",
"semver",
"requests",
],
python_requires=">=3.6",
author="Project Jupyter Contributors",

Wyświetl plik

@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock
from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile
from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException
# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not actually call it, to avoid having to depend on swh.model[cli];
# it is kept as a reference for how the hard-coded dirhash was computed.
def swhid_of_dir(path):
    # NOTE(review): `Directory`, `swhid` and `DIRECTORY` are not imported in
    # this module, so calling this function raises NameError. It is kept only
    # as reference documentation (see the comment above its definition).
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)
def test_content_id():
    """A freshly constructed provider has not resolved any content yet."""
    provider = Swhid()
    assert provider.content_id is None
# Well-formed v1 dir/rev SWHIDs that detect() must accept.
swhids_ok = ["swh:1:{}:{}".format(kind, "0" * 40) for kind in ("dir", "rev")]

# Malformed or unsupported identifiers that detect() must reject.
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,  # hash too short
    "swh:2:dir:" + "0" * 40,  # unsupported SWHID version
    "swh:1:rev:" + "0" * 41,  # hash too long
    "swh:1:cnt:" + "0" * 40,  # unsupported object types below
    "swh:1:ori:" + "0" * 40,
    "swh:1:rel:" + "0" * 40,
    "swh:1:snp:" + "0" * 40,
]

# (input, expected-detect-result) pairs for the parametrized test below.
detect_values = [
    (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)})
    for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]
@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() returns the parsed spec for valid SWHIDs, None otherwise."""
    result = Swhid().detect(swhid)
    assert result == expected
def fake_urlopen(req):
    """Stand-in for urlopen: log the request and hand back its headers."""
    print(req)
    headers = req.headers
    return headers
def test_unresolving_swhid():
    """Constructing the provider must not raise before any SWHID is set."""
    provider = Swhid()
# A 40-character all-zero sha1, used as a placeholder revision id in mocks.
NULLID = 40 * "0"
@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tarball mimicking a SWH vault directory bundle.

    Returns (dirhash, tarball_bytes); the tarball holds a single
    top-level directory named after the hash, like the vault produces.
    """
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # This directory hash could be computed with the swh.model package, but
    # we do not want to depend on it here (it also does not support
    # python 3.6), so the value is hard-coded.
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"

    buf = io.BytesIO()
    with tarfile.open(name=dirhash, fileobj=buf, mode="w") as tarf:
        tarf.add(rootdir, arcname=dirhash)
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()
def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider wired to a mock transport serving `dirhash`."""
    provider = Swhid()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1

    adapter = requests_mock.Adapter()
    provider.session.mount("mock://", adapter)

    vault_url = "mock://api/1/vault/directory/{}/".format(dirhash)
    fetch_url = "mock://api/1/vault/directory/{}/raw/".format(dirhash)

    # Revision lookup resolves to our directory hash.
    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <jdoe@example.com>"},
            "directory": dirhash,
        },
    )
    # Cooking request: the vault acknowledges a new job.
    adapter.register_uri(
        "POST",
        vault_url,
        json={"fetch_url": fetch_url, "status": "new"},
    )
    # Status polling: "pending" on the first poll, then "done".
    adapter.register_uri(
        "GET",
        vault_url,
        [
            {"json": {"fetch_url": fetch_url, "status": "pending"}},
            {"json": {"fetch_url": fetch_url, "status": "done"}},
        ],
    )
    # Downloading the cooked tarball.
    adapter.register_uri("GET", fetch_url, content=tarfile_buf)
    return provider
def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a revision SWHID resolves to the underlying directory."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    spec = provider.detect("swh:1:rev:" + NULLID)
    for message in provider.fetch(spec, tmpdir):
        print(message)
    assert provider.content_id == "swh:1:dir:" + dir_id
def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a directory SWHID leaves content_id equal to that SWHID."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    dir_swhid = "swh:1:dir:" + dir_id
    for message in provider.fetch(provider.detect(dir_swhid), tmpdir):
        print(message)
    assert provider.content_id == dir_swhid