Merge pull request #988 from douardda/swhid

pull/1008/head
Tim Head 2021-01-26 13:56:29 +01:00 committed by GitHub
commit 1140dd1919
No key found in the database for this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 301 additions and 9 deletions

docs/source/usage.rst

@@ -14,6 +14,8 @@ Using ``repo2docker``
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
+a `SWHID`_ of a directory or a revision archived in the
+`Software Heritage Archive <https://archive.softwareheritage.org>`_,
or a path to a local directory.
It then performs these steps:
@@ -36,7 +38,8 @@ repo2docker is called with this command::

where ``<source-repository>`` is:

* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
-* a Zenodo DOI (``10.5281/zenodo.1211089``), or
+* a Zenodo DOI (``10.5281/zenodo.1211089``),
+* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
* a path to a local directory (``a/local/directory``)

of the source repository you want to build.
@@ -132,3 +135,4 @@ Command line API
.. _Pytudes: https://github.com/norvig/pytudes
+.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
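
With this addition a build can be started directly from a SWHID, for example::

    repo2docker swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649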

repo2docker/app.py

@@ -148,6 +148,7 @@ class Repo2Docker(Application):
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Hydroshare,
+            contentproviders.Swhid,
contentproviders.Mercurial,
contentproviders.Git,
],
@@ -269,6 +270,18 @@ class Repo2Docker(Application):
allow_none=True,
)
+    swh_token = Unicode(
+        None,
+        help="""
+        Token to use for authenticated SWH API access.
+
+        If unset, defaults to unauthenticated (limited) usage of the Software
+        Heritage API.
+        """,
+        config=True,
+        allow_none=True,
+    )
cleanup_checkout = Bool(
False,
help="""
@@ -395,26 +408,29 @@ class Repo2Docker(Application):
                "No matching content provider found for " "{url}.".format(url=url)
            )

+        swh_token = self.config.get("swh_token", self.swh_token)
+        if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
+            picked_content_provider.set_auth_token(swh_token)
+
        for log_line in picked_content_provider.fetch(
            spec, checkout_path, yield_output=self.json_logs
        ):
            self.log.info(log_line, extra=dict(phase="fetching"))

        if not self.output_image_spec:
-            self.output_image_spec = (
-                "r2d" + escapism.escape(self.repo, escape_char="-").lower()
-            )
+            image_spec = "r2d" + self.repo
            # if we are building from a subdirectory include that in the
            # image name so we can tell builds from different sub-directories
            # apart.
            if self.subdir:
-                self.output_image_spec += escapism.escape(
-                    self.subdir, escape_char="-"
-                ).lower()
+                image_spec += self.subdir
            if picked_content_provider.content_id is not None:
-                self.output_image_spec += picked_content_provider.content_id
+                image_spec += picked_content_provider.content_id
            else:
-                self.output_image_spec += str(int(time.time()))
+                image_spec += str(int(time.time()))
+            self.output_image_spec = escapism.escape(
+                image_spec, escape_char="-"
+            ).lower()

def json_excepthook(self, etype, evalue, traceback):
"""Called on an uncaught exception when using json logging

repo2docker/contentproviders/__init__.py

@@ -5,3 +5,4 @@ from .figshare import Figshare
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
+from .swhid import Swhid

repo2docker/contentproviders/swhid.py

@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re
from os import path

import requests

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
# only parse/check the <identifier_core> of the swhid
# see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
m = re.match(swhid_regexp, swhid.split(";")[0])
if m:
return m.groupdict()


class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID."""

    retry_delay = 5

    def __init__(self):
self.swhid = None
self.base_url = "https://archive.softwareheritage.org/api/1"
self.session = requests.Session()
self.session.headers.update(
{
"user-agent": "repo2docker {}".format(__version__),
}
)

    def set_auth_token(self, token):
header = {"Authorization": "Bearer {}".format(token)}
self.session.headers.update(header)

    def _request(self, url, method="GET"):
if not url.endswith("/"):
url = url + "/"
for retries in range(3):
try:
resp = self.session.request(method, url)
if resp.ok:
break
except requests.ConnectionError:
time.sleep(self.retry_delay)
return resp

    @property
def content_id(self):
"""The SWHID record ID used for content retrival"""
return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
swhid_dict = parse_swhid(swhid)
if (
swhid_dict
and swhid_dict["type"] in ("dir", "rev")
and swhid_dict["version"] == "1"
):
return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
yield "Fetching directory {} from {}\n".format(dir_hash, url)
resp = self._request(url, "POST")
receipt = resp.json()
status = receipt["status"]
assert status != "failed", receipt
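        # poll the vault until the cooking job finishes (or fails)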
while status not in ("failed", "done"):
time.sleep(self.retry_delay)
resp = self._request(url)
status = resp.json()["status"]
if status == "failed":
yield "Error preparing the directory for download"
raise Exception()
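        # cooking is done: download the tarball from fetch_url and unpack it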
resp = self._request(resp.json()["fetch_url"])
archive = tarfile.open(fileobj=io.BytesIO(resp.content))
archive.extractall(path=output_dir)
# the output_dir should have only one subdir named after the dir_hash
# move its content one level up
copytree(path.join(output_dir, dir_hash), output_dir)
shutil.rmtree(path.join(output_dir, dir_hash))
yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
swhid = spec["swhid"]
swhid_obj = spec["swhid_obj"]
if swhid_obj["type"] == "rev":
# need to get the directory for this revision
sha1git = swhid_obj["hash"]
url = "{}/revision/{}/".format(self.base_url, sha1git)
yield "Fetching revision {} from {}\n".format(sha1git, url)
resp = self._request(url)
assert resp.ok, (resp.content, self.session.headers)
directory = resp.json()["directory"]
self.swhid = "swh:1:dir:{}".format(directory)
yield from self.fetch_directory(directory, output_dir)
elif swhid_obj["type"] == "dir":
self.swhid = swhid
yield from self.fetch_directory(swhid_obj["hash"], output_dir)
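
A quick sanity check of the parsing and detection logic above (a sketch, not part of the diff):

from repo2docker.contentproviders.swhid import Swhid, parse_swhid

print(parse_swhid("swh:1:dir:" + "0" * 40))
# -> dict with version '1', type 'dir' and the 40-character hash
print(parse_swhid("swh:1:dir:" + "0" * 39))     # malformed hash -> None
print(Swhid().detect("swh:1:cnt:" + "0" * 40))  # 'cnt' objects are not supported -> None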

setup.py

@@ -56,6 +56,7 @@ setup(
"ruamel.yaml>=0.15",
"toml",
"semver",
"requests",
],
python_requires=">=3.6",
author="Project Jupyter Contributors",

tests/unit/contentproviders/test_swhid.py

@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir();
# we do not use the latter to avoid having to depend on swh.model[cli]
def swhid_of_dir(path):
    # Directory, DIRECTORY and swhid would come from swh.model; this helper
    # is kept for reference only and is not called by the tests below
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
swhid = Swhid()
assert swhid.content_id is None


swhids_ok = [
"swh:1:dir:" + "0" * 40,
"swh:1:rev:" + "0" * 40,
]
swhids_invalid = [
"swh:1:dir:" + "0" * 39,
"swh:2:dir:" + "0" * 40,
"swh:1:rev:" + "0" * 41,
"swh:1:cnt:" + "0" * 40,
"swh:1:ori:" + "0" * 40,
"swh:1:rel:" + "0" * 40,
"swh:1:snp:" + "0" * 40,
]
detect_values = [
(swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
provider = Swhid()
assert provider.detect(swhid) == expected


def fake_urlopen(req):
print(req)
return req.headers


def test_unresolving_swhid():
provider = Swhid()
# swhid = "0" * 40
# assert provider.swhid2url(swhid) is swhid
NULLID = "0" * 40
@pytest.fixture
def gen_tarfile(tmpdir):
rootdir = join(tmpdir, "tmp")
makedirs(rootdir)
with open(join(rootdir, "file1.txt"), "wb") as fobj:
fobj.write(b"Some content\n")
    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on the latter here, to limit dependencies and
    # because it does not support Python 3.6
dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
buf = io.BytesIO()
tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w")
tarf.add(rootdir, arcname=dirhash)
tarf.close()
shutil.rmtree(rootdir)
return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
provider = Swhid()
adapter = requests_mock.Adapter()
provider.base_url = "mock://api/1"
provider.retry_delay = 0.1
provider.session.mount("mock://", adapter)
adapter.register_uri(
"GET",
"mock://api/1/revision/{}/".format(NULLID),
json={
"author": {"fullname": "John Doe <jdoe@example.com>"},
"directory": dirhash,
},
)
adapter.register_uri(
"POST",
"mock://api/1/vault/directory/{}/".format(dirhash),
json={
"fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
"status": "new",
},
)
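    # the response list below makes successive GETs return "pending" and then
    # "done", exercising the provider's polling loop in fetch_directory()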
adapter.register_uri(
"GET",
"mock://api/1/vault/directory/{}/".format(dirhash),
[
{
"json": {
"fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
"status": "pending",
}
},
{
"json": {
"fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash),
"status": "done",
}
},
],
)
adapter.register_uri(
"GET",
"mock://api/1/vault/directory/{}/raw/".format(dirhash),
content=tarfile_buf,
)
return provider


def test_fetch_revision(tmpdir, gen_tarfile):
dir_id, tarfile_buf = gen_tarfile
provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
swhid = "swh:1:rev:" + NULLID
for log in provider.fetch(provider.detect(swhid), tmpdir):
print(log)
assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
dir_id, tarfile_buf = gen_tarfile
provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
swhid = "swh:1:dir:" + dir_id
for log in provider.fetch(provider.detect(swhid), tmpdir):
print(log)
assert provider.content_id == swhid
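
A note on the mock setup above: when ``register_uri`` receives a list of response dicts, ``requests_mock`` replays them in order on successive calls. A self-contained sketch of just that behaviour (the URL and names are ours):

import requests
import requests_mock

adapter = requests_mock.Adapter()
adapter.register_uri(
    "GET",
    "mock://vault/status/",
    [{"json": {"status": "pending"}}, {"json": {"status": "done"}}],
)

session = requests.Session()
session.mount("mock://", adapter)

# successive calls walk through the registered responses in order
assert session.get("mock://vault/status/").json()["status"] == "pending"
assert session.get("mock://vault/status/").json()["status"] == "done"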