Mirror of https://github.com/jupyterhub/repo2docker

Merge pull request #988 from douardda/swhid
commit 1140dd1919

@@ -14,6 +14,8 @@ Using ``repo2docker``
 follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
 a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
 a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
+a `SWHID`_ of a directory or a revision archived in the
+`Software Heritage Archive <https://archive.softwareheritage.org>`_,
 or a path to a local directory.

 It then performs these steps:
@@ -36,7 +38,8 @@ repo2docker is called with this command::
 where ``<source-repository>`` is:

 * a URL of a Git repository (``https://github.com/binder-examples/requirements``),
-* a Zenodo DOI (``10.5281/zenodo.1211089``), or
+* a Zenodo DOI (``10.5281/zenodo.1211089``),
+* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
 * a path to a local directory (``a/local/directory``)

 of the source repository you want to build.
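
For example, the SWHID bullet above corresponds to an invocation like::

    repo2docker swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649
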
@@ -132,3 +135,4 @@ Command line API


 .. _Pytudes: https://github.com/norvig/pytudes
+.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

@@ -148,6 +148,7 @@ class Repo2Docker(Application):
             contentproviders.Figshare,
             contentproviders.Dataverse,
             contentproviders.Hydroshare,
+            contentproviders.Swhid,
             contentproviders.Mercurial,
             contentproviders.Git,
         ],
@@ -269,6 +270,18 @@ class Repo2Docker(Application):
         allow_none=True,
     )

+    swh_token = Unicode(
+        None,
+        help="""
+        Token to use for authenticated SWH API access.
+
+        If unset, defaults to unauthenticated (limited) usage of the Software
+        Heritage API.
+        """,
+        config=True,
+        allow_none=True,
+    )
+
     cleanup_checkout = Bool(
         False,
         help="""
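
Note: because ``swh_token`` is declared with ``config=True``, it can be set
through any traitlets configuration source. A minimal sketch (the file name
and token value are hypothetical):

    # contents of a hypothetical repo2docker config file
    c.Repo2Docker.swh_token = "my-swh-api-token"
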
@@ -395,26 +408,29 @@ class Repo2Docker(Application):
                 "No matching content provider found for " "{url}.".format(url=url)
             )

+        swh_token = self.config.get("swh_token", self.swh_token)
+        if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
+            picked_content_provider.set_auth_token(swh_token)
+
         for log_line in picked_content_provider.fetch(
             spec, checkout_path, yield_output=self.json_logs
         ):
             self.log.info(log_line, extra=dict(phase="fetching"))

         if not self.output_image_spec:
-            self.output_image_spec = (
-                "r2d" + escapism.escape(self.repo, escape_char="-").lower()
-            )
+            image_spec = "r2d" + self.repo
             # if we are building from a subdirectory include that in the
             # image name so we can tell builds from different sub-directories
             # apart.
             if self.subdir:
-                self.output_image_spec += escapism.escape(
-                    self.subdir, escape_char="-"
-                ).lower()
+                image_spec += self.subdir
             if picked_content_provider.content_id is not None:
-                self.output_image_spec += picked_content_provider.content_id
+                image_spec += picked_content_provider.content_id
             else:
-                self.output_image_spec += str(int(time.time()))
+                image_spec += str(int(time.time()))
+            self.output_image_spec = escapism.escape(
+                image_spec, escape_char="-"
+            ).lower()

     def json_excepthook(self, etype, evalue, traceback):
         """Called on an uncaught exception when using json logging

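Note: the refactor above assembles the raw image name first and escapes it
once at the end, so a content id such as a SWHID (with its colons) is escaped
as well. An illustrative sketch with made-up values, using the escapism
library repo2docker already depends on:

    import escapism

    # hypothetical values: a revision SWHID as the repo spec, the resolved
    # directory SWHID as the content id
    image_spec = "r2d" + "swh:1:rev:" + "0" * 40 + "swh:1:dir:" + "1" * 40
    print(escapism.escape(image_spec, escape_char="-").lower())
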
@@ -5,3 +5,4 @@ from .figshare import Figshare
 from .dataverse import Dataverse
 from .hydroshare import Hydroshare
 from .mercurial import Mercurial
+from .swhid import Swhid

@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re

from os import path

import requests

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
    # only parse/check the <identifier_core> of the swhid
    # see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
    m = re.match(swhid_regexp, swhid.split(";")[0])
    if m:
        return m.groupdict()
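
# Editorial illustration (not part of the change): a well-formed v1 SWHID
# parses into its named components, anything else falls through to None:
#
#     parse_swhid("swh:1:dir:" + "0" * 40)
#     # -> {"version": "1", "type": "dir", "hash": "0" * 40}
#     parse_swhid("swh:1:dir:" + "0" * 39)  # hash too short
#     # -> None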


class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID."""

    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        if not url.endswith("/"):
            url = url + "/"

        # make sure resp is defined even if every attempt raises ConnectionError
        resp = None
        for _ in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
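        """Fetch a directory via the Software Heritage vault.

        A POST request asks the vault to cook the directory into a tarball,
        the loop below polls until cooking is done, and the resulting archive
        is downloaded and extracted into output_dir.
        """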
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception()
        resp = self._request(resp.json()["fetch_url"])
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
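
A minimal usage sketch of the provider (editorial; the directory id is
hypothetical and fetch() performs real requests against the public archive):

    from repo2docker.contentproviders import Swhid

    provider = Swhid()
    spec = provider.detect("swh:1:dir:" + "0" * 40)
    if spec is not None:
        for msg in provider.fetch(spec, "/tmp/checkout"):
            print(msg, end="")
        print(provider.content_id)  # the directory SWHID actually fetched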

setup.py
@@ -56,6 +56,7 @@ setup(
         "ruamel.yaml>=0.15",
         "toml",
         "semver",
+        "requests",
     ],
     python_requires=">=3.6",
     author="Project Jupyter Contributors",

@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not use the upstream version to avoid having to depend on
# swh.model[cli]; note that running it would require Directory, DIRECTORY
# and swhid from swh.model, which are intentionally not imported here.
def swhid_of_dir(path):
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
    swhid = Swhid()
    assert swhid.content_id is None


swhids_ok = [
    "swh:1:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 40,
]
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,
    "swh:2:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 41,
    "swh:1:cnt:" + "0" * 40,
    "swh:1:ori:" + "0" * 40,
    "swh:1:rel:" + "0" * 40,
    "swh:1:snp:" + "0" * 40,
]

detect_values = [
    (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    provider = Swhid()
    assert provider.detect(swhid) == expected


def fake_urlopen(req):
    print(req)
    return req.headers


def test_unresolving_swhid():
    provider = Swhid()

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid


NULLID = "0" * 40


@pytest.fixture
def gen_tarfile(tmpdir):
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on it here, both to limit dependencies and
    # because it does not support python 3.6
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
    buf = io.BytesIO()
    tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w")
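    # mimic the vault tarball layout: a single top-level directory named
    # after the directory hash, which fetch_directory() later moves up a level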
    tarf.add(rootdir, arcname=dirhash)
    tarf.close()
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    provider = Swhid()
    adapter = requests_mock.Adapter()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)

    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <jdoe@example.com>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri(
        "POST",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        json={
            "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
            "status": "new",
        },
    )
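    # registering a list of responses makes requests_mock return them in
    # order on successive GETs: first "pending", then "done", which exercises
    # the polling loop in fetch_directory()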
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        [
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "pending",
                }
            },
            {
                "json": {
                    "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
                    "status": "done",
                }
            },
        ],
    )
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/raw/".format(dirhash),
        content=tarfile_buf,
    )
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:rev:" + NULLID
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    swhid = "swh:1:dir:" + dir_id
    for log in provider.fetch(provider.detect(swhid), tmpdir):
        print(log)
    assert provider.content_id == swhid