# Mirrored from https://github.com/jupyterhub/repo2docker
"""repo2docker: convert git repositories into jupyter-suitable docker images
|
|
|
|
Images produced by repo2docker can be used with Jupyter notebooks standalone
|
|
or with BinderHub.
|
|
|
|
Usage:
|
|
|
|
python -m repo2docker https://github.com/you/your-repo
|
|
"""
|
|
import json
|
|
import sys
|
|
import logging
|
|
import os
|
|
import getpass
|
|
import shutil
|
|
import tempfile
|
|
import time
|
|
|
|
import docker
|
|
from urllib.parse import urlparse
|
|
from docker.utils import kwargs_from_env
|
|
from docker.errors import DockerException
|
|
import escapism
|
|
from pythonjsonlogger import jsonlogger
|
|
|
|
from traitlets import Any, Dict, Int, List, Unicode, Bool, default
|
|
from traitlets.config import Application
|
|
|
|
from . import __version__
|
|
from .buildpacks import (
|
|
CondaBuildPack,
|
|
DockerBuildPack,
|
|
JuliaProjectTomlBuildPack,
|
|
JuliaRequireBuildPack,
|
|
LegacyBinderDockerBuildPack,
|
|
NixBuildPack,
|
|
PipfileBuildPack,
|
|
PythonBuildPack,
|
|
RBuildPack,
|
|
)
|
|
from . import contentproviders
|
|
from .utils import ByteSpecification, chdir
|
|
|
|
|
|
class Repo2Docker(Application):
    """An application for converting git repositories to docker images"""

    # Metadata consumed by traitlets' Application machinery for the CLI
    # (program name, --version output, and --help description).
    name = "jupyter-repo2docker"
    version = __version__
    description = __doc__

    @default("log_level")
    def _default_log_level(self):
        """The application's default log level"""
        return logging.INFO
    # --- Checkout / build configuration traits -----------------------------

    git_workdir = Unicode(
        None,
        config=True,
        allow_none=True,
        help="""
        Working directory to use for check out of git repositories.

        The default is to use the system's temporary directory. Should be
        somewhere ephemeral, such as /tmp.
        """,
    )

    subdir = Unicode(
        "",
        config=True,
        help="""
        Subdirectory of the git repository to examine.

        Defaults to ''.
        """,
    )

    cache_from = List(
        [],
        config=True,
        help="""
        List of images to try & re-use cached image layers from.

        Docker only tries to re-use image layers from images built locally,
        not pulled from a registry. We can ask it to explicitly re-use layers
        from non-locally built images by through the 'cache_from' parameter.
        """,
    )

    # Order matters: the first buildpack whose detect() matches wins, so
    # more specific packs must come before the generic Python fallback.
    buildpacks = List(
        [
            LegacyBinderDockerBuildPack,
            DockerBuildPack,
            JuliaProjectTomlBuildPack,
            JuliaRequireBuildPack,
            NixBuildPack,
            RBuildPack,
            CondaBuildPack,
            PipfileBuildPack,
            PythonBuildPack,
        ],
        config=True,
        help="""
        Ordered list of BuildPacks to try when building a git repository.
        """,
    )

    extra_build_kwargs = Dict(
        {},
        help="""
        extra kwargs to limit CPU quota when building a docker image.
        Dictionary that allows the user to set the desired runtime flag
        to configure the amount of access to CPU resources your container has.
        Reference https://docs.docker.com/config/containers/resource_constraints/#cpu
        """,
        config=True,
    )

    extra_run_kwargs = Dict(
        {},
        help="""
        extra kwargs to limit CPU quota when running a docker image.
        Dictionary that allows the user to set the desired runtime flag
        to configure the amount of access to CPU resources your container has.
        Reference https://docs.docker.com/config/containers/resource_constraints/#cpu
        """,
        config=True,
    )

    default_buildpack = Any(
        PythonBuildPack,
        config=True,
        help="""
        The default build pack to use when no other buildpacks are found.
        """,
    )

    # Git is our content provider of last resort. This is to maintain the
    # old behaviour when git and local directories were the only supported
    # content providers. We can detect local directories from the path, but
    # detecting if something will successfully `git clone` is very hard if all
    # you can do is look at the path/URL to it.
    content_providers = List(
        [
            contentproviders.Local,
            contentproviders.Zenodo,
            contentproviders.Figshare,
            contentproviders.Dataverse,
            contentproviders.Git,
        ],
        config=True,
        help="""
        Ordered list by priority of ContentProviders to try in turn to fetch
        the contents specified by the user.
        """,
    )

    build_memory_limit = ByteSpecification(
        0,
        help="""
        Total memory that can be used by the docker image building process.

        Set to 0 for no limits.
        """,
        config=True,
    )

    volumes = Dict(
        {},
        help="""
        Volumes to mount when running the container.

        Only used when running, not during build process!

        Use a key-value pair, with the key being the volume source &
        value being the destination volume.

        Both source and destination can be relative. Source is resolved
        relative to the current working directory on the host, and
        destination is resolved relative to the working directory of the
        image - ($HOME by default)
        """,
        config=True,
    )

    user_id = Int(
        help="""
        UID of the user to create inside the built image.

        Should be a uid that is not currently used by anything in the image.
        Defaults to uid of currently running user, since that is the most
        common case when running r2d manually.

        Might not affect Dockerfile builds.
        """,
        config=True,
    )
@default("user_id")
|
|
def _user_id_default(self):
|
|
"""
|
|
Default user_id to current running user.
|
|
"""
|
|
return os.geteuid()
|
|
|
|
    user_name = Unicode(
        "jovyan",
        help="""
        Username of the user to create inside the built image.

        Should be a username that is not currently used by anything in the
        image, and should conform to the restrictions on user names for Linux.

        Defaults to username of currently running user, since that is the most
        common case when running repo2docker manually.
        """,
        config=True,
    )
@default("user_name")
|
|
def _user_name_default(self):
|
|
"""
|
|
Default user_name to current running user.
|
|
"""
|
|
return getpass.getuser()
|
|
|
|
appendix = Unicode(
|
|
config=True,
|
|
help="""
|
|
Appendix of Dockerfile commands to run at the end of the build.
|
|
|
|
Can be used to customize the resulting image after all
|
|
standard build steps finish.
|
|
""",
|
|
)
|
|
|
|
json_logs = Bool(
|
|
False,
|
|
help="""
|
|
Log output in structured JSON format.
|
|
|
|
Useful when stdout is consumed by other tools
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
repo = Unicode(
|
|
".",
|
|
help="""
|
|
Specification of repository to build image for.
|
|
|
|
Could be local path or git URL.
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
ref = Unicode(
|
|
None,
|
|
help="""
|
|
Git ref that should be built.
|
|
|
|
If repo is a git repository, this ref is checked out
|
|
in a local clone before repository is built.
|
|
""",
|
|
config=True,
|
|
allow_none=True,
|
|
)
|
|
|
|
cleanup_checkout = Bool(
|
|
False,
|
|
help="""
|
|
Delete source repository after building is done.
|
|
|
|
Useful when repo2docker is doing the git cloning
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
output_image_spec = Unicode(
|
|
"",
|
|
help="""
|
|
Docker Image name:tag to tag the built image with.
|
|
|
|
Required parameter.
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
push = Bool(
|
|
False,
|
|
help="""
|
|
Set to true to push docker image after building
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
run = Bool(
|
|
False,
|
|
help="""
|
|
Run docker image after building
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
# FIXME: Refactor class to be able to do --no-build without needing
|
|
# deep support for it inside other code
|
|
dry_run = Bool(
|
|
False,
|
|
help="""
|
|
Do not actually build the docker image, just simulate it.
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
# FIXME: Refactor classes to separate build & run steps
|
|
run_cmd = List(
|
|
[],
|
|
help="""
|
|
Command to run when running the container
|
|
|
|
When left empty, a jupyter notebook is run.
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
all_ports = Bool(
|
|
False,
|
|
help="""
|
|
Publish all declared ports from container whiel running.
|
|
|
|
Equivalent to -P option to docker run
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
ports = Dict(
|
|
{},
|
|
help="""
|
|
Port mappings to establish when running the container.
|
|
|
|
Equivalent to -p {key}:{value} options to docker run.
|
|
{key} refers to port inside container, and {value}
|
|
refers to port / host:port in the host
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
environment = List(
|
|
[],
|
|
help="""
|
|
Environment variables to set when running the built image.
|
|
|
|
Each item must be a string formatted as KEY=VALUE
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
target_repo_dir = Unicode(
|
|
"",
|
|
help="""
|
|
Path inside the image where contents of the repositories are copied to,
|
|
and where all the build operations (such as postBuild) happen.
|
|
|
|
Defaults to ${HOME} if not set
|
|
""",
|
|
config=True,
|
|
)
|
|
|
|
def fetch(self, url, ref, checkout_path):
|
|
"""Fetch the contents of `url` and place it in `checkout_path`.
|
|
|
|
The `ref` parameter specifies what "version" of the contents should be
|
|
fetched. In the case of a git repository `ref` is the SHA-1 of a commit.
|
|
|
|
Iterate through possible content providers until a valid provider,
|
|
based on URL, is found.
|
|
"""
|
|
picked_content_provider = None
|
|
for ContentProvider in self.content_providers:
|
|
cp = ContentProvider()
|
|
spec = cp.detect(url, ref=ref)
|
|
if spec is not None:
|
|
picked_content_provider = cp
|
|
self.log.info(
|
|
"Picked {cp} content "
|
|
"provider.\n".format(cp=cp.__class__.__name__)
|
|
)
|
|
break
|
|
|
|
if picked_content_provider is None:
|
|
self.log.error(
|
|
"No matching content provider found for " "{url}.".format(url=url)
|
|
)
|
|
|
|
for log_line in picked_content_provider.fetch(
|
|
spec, checkout_path, yield_output=self.json_logs
|
|
):
|
|
self.log.info(log_line, extra=dict(phase="fetching"))
|
|
|
|
if not self.output_image_spec:
|
|
self.output_image_spec = (
|
|
"r2d" + escapism.escape(self.repo, escape_char="-").lower()
|
|
)
|
|
# if we are building from a subdirectory include that in the
|
|
# image name so we can tell builds from different sub-directories
|
|
# apart.
|
|
if self.subdir:
|
|
self.output_image_spec += escapism.escape(
|
|
self.subdir, escape_char="-"
|
|
).lower()
|
|
if picked_content_provider.content_id is not None:
|
|
self.output_image_spec += picked_content_provider.content_id
|
|
else:
|
|
self.output_image_spec += str(int(time.time()))
|
|
|
|
def json_excepthook(self, etype, evalue, traceback):
|
|
"""Called on an uncaught exception when using json logging
|
|
|
|
Avoids non-JSON output on errors when using --json-logs
|
|
"""
|
|
self.log.error(
|
|
"Error during build: %s",
|
|
evalue,
|
|
exc_info=(etype, evalue, traceback),
|
|
extra=dict(phase="failed"),
|
|
)
|
|
|
|
def initialize(self):
|
|
"""Init repo2docker configuration before start"""
|
|
# FIXME: Remove this function, move it to setters / traitlet reactors
|
|
if self.json_logs:
|
|
# register JSON excepthook to avoid non-JSON output on errors
|
|
sys.excepthook = self.json_excepthook
|
|
# Need to reset existing handlers, or we repeat messages
|
|
logHandler = logging.StreamHandler()
|
|
formatter = jsonlogger.JsonFormatter()
|
|
logHandler.setFormatter(formatter)
|
|
self.log = logging.getLogger("repo2docker")
|
|
self.log.handlers = []
|
|
self.log.addHandler(logHandler)
|
|
self.log.setLevel(self.log_level)
|
|
else:
|
|
# due to json logger stuff above,
|
|
# our log messages include carriage returns, newlines, etc.
|
|
# remove the additional newline from the stream handler
|
|
self.log.handlers[0].terminator = ""
|
|
# We don't want a [Repo2Docker] on all messages
|
|
self.log.handlers[0].formatter = logging.Formatter(fmt="%(message)s")
|
|
|
|
if self.dry_run and (self.run or self.push):
|
|
raise ValueError("Cannot push or run image if we are not building it")
|
|
|
|
if self.volumes and not self.run:
|
|
raise ValueError("Cannot mount volumes if container is not run")
|
|
|
|
def push_image(self):
|
|
"""Push docker image to registry"""
|
|
client = docker.APIClient(version="auto", **kwargs_from_env())
|
|
# Build a progress setup for each layer, and only emit per-layer
|
|
# info every 1.5s
|
|
progress_layers = {}
|
|
layers = {}
|
|
last_emit_time = time.time()
|
|
for chunk in client.push(self.output_image_spec, stream=True):
|
|
# each chunk can be one or more lines of json events
|
|
# split lines here in case multiple are delivered at once
|
|
for line in chunk.splitlines():
|
|
line = line.decode("utf-8", errors="replace")
|
|
try:
|
|
progress = json.loads(line)
|
|
except Exception as e:
|
|
self.log.warning("Not a JSON progress line: %r", line)
|
|
continue
|
|
if "error" in progress:
|
|
self.log.error(progress["error"], extra=dict(phase="failed"))
|
|
raise docker.errors.ImageLoadError(progress["error"])
|
|
if "id" not in progress:
|
|
continue
|
|
# deprecated truncated-progress data
|
|
if "progressDetail" in progress and progress["progressDetail"]:
|
|
progress_layers[progress["id"]] = progress["progressDetail"]
|
|
else:
|
|
progress_layers[progress["id"]] = progress["status"]
|
|
# include full progress data for each layer in 'layers' data
|
|
layers[progress["id"]] = progress
|
|
if time.time() - last_emit_time > 1.5:
|
|
self.log.info(
|
|
"Pushing image\n",
|
|
extra=dict(
|
|
progress=progress_layers, layers=layers, phase="pushing"
|
|
),
|
|
)
|
|
last_emit_time = time.time()
|
|
self.log.info(
|
|
"Successfully pushed {}".format(self.output_image_spec),
|
|
extra=dict(phase="pushing"),
|
|
)
|
|
|
|
def run_image(self):
|
|
"""Run docker container from built image
|
|
|
|
and wait for it to finish.
|
|
"""
|
|
container = self.start_container()
|
|
self.wait_for_container(container)
|
|
|
|
    def start_container(self):
        """Start docker container from built image

        Returns running container
        """
        client = docker.from_env(version="auto")

        # Resolve the host name clients should use to reach the container:
        # the DOCKER_HOST hostname when docker runs remotely, localhost
        # otherwise.
        docker_host = os.environ.get("DOCKER_HOST")
        if docker_host:
            host_name = urlparse(docker_host).hostname
        else:
            host_name = "127.0.0.1"
        # stored on self so callers/tests can construct URLs
        self.hostname = host_name

        if not self.run_cmd:
            # No explicit command given: run a Jupyter notebook server on a
            # freshly allocated free port.
            port = str(self._get_free_port())
            self.port = port
            # To use the option --NotebookApp.custom_display_url
            # make sure the base-notebook image is updated:
            # docker pull jupyter/base-notebook
            run_cmd = [
                "jupyter",
                "notebook",
                "--ip",
                "0.0.0.0",
                "--port",
                port,
                "--NotebookApp.custom_display_url=http://{}:{}".format(host_name, port),
            ]
            ports = {"%s/tcp" % port: port}
        else:
            # run_cmd given by user, if port is also given then pass it on
            run_cmd = self.run_cmd
            if self.ports:
                ports = self.ports
            else:
                ports = {}
        # store ports on self so they can be retrieved in tests
        self.ports = ports

        container_volumes = {}
        if self.volumes:
            # Relative mount destinations are resolved against the image's
            # working directory, so inspect the image to find it.
            api_client = docker.APIClient(
                version="auto", **docker.utils.kwargs_from_env()
            )
            image = api_client.inspect_image(self.output_image_spec)
            image_workdir = image["ContainerConfig"]["WorkingDir"]

            for k, v in self.volumes.items():
                container_volumes[os.path.abspath(k)] = {
                    "bind": v if v.startswith("/") else os.path.join(image_workdir, v),
                    "mode": "rw",
                }

        run_kwargs = dict(
            publish_all_ports=self.all_ports,
            ports=ports,
            detach=True,
            command=run_cmd,
            volumes=container_volumes,
            environment=self.environment,
        )

        # user-supplied extra kwargs override the defaults assembled above
        run_kwargs.update(self.extra_run_kwargs)

        container = client.containers.run(self.output_image_spec, **run_kwargs)

        # Poll until the container has left the "created" state, i.e. has
        # actually started (or failed).
        while container.status == "created":
            time.sleep(0.5)
            container.reload()

        return container
def wait_for_container(self, container):
|
|
"""Wait for a container to finish
|
|
|
|
Displaying logs while it's running
|
|
"""
|
|
|
|
try:
|
|
for line in container.logs(stream=True):
|
|
self.log.info(line.decode("utf-8"), extra=dict(phase="running"))
|
|
finally:
|
|
container.reload()
|
|
if container.status == "running":
|
|
self.log.info("Stopping container...\n", extra=dict(phase="running"))
|
|
container.kill()
|
|
exit_code = container.attrs["State"]["ExitCode"]
|
|
container.remove()
|
|
if exit_code:
|
|
sys.exit(exit_code)
|
|
|
|
def _get_free_port(self):
|
|
"""
|
|
Hacky method to get a free random port on local host
|
|
"""
|
|
import socket
|
|
|
|
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
s.bind(("", 0))
|
|
port = s.getsockname()[1]
|
|
s.close()
|
|
return port
|
|
|
|
def find_image(self):
|
|
# if this is a dry run it is Ok for dockerd to be unreachable so we
|
|
# always return False for dry runs.
|
|
if self.dry_run:
|
|
return False
|
|
# check if we already have an image for this content
|
|
client = docker.APIClient(version="auto", **kwargs_from_env())
|
|
for image in client.images():
|
|
if image["RepoTags"] is not None:
|
|
for tag in image["RepoTags"]:
|
|
if tag == self.output_image_spec + ":latest":
|
|
return True
|
|
return False
|
|
|
|
    def build(self):
        """
        Build docker image

        Fetches the repository, picks a buildpack via detect(), and runs
        the build — unless a matching image already exists or this is a
        dry run (which only prints the rendered Dockerfile).
        """
        # Check if r2d can connect to docker daemon
        if not self.dry_run:
            try:
                docker_client = docker.APIClient(version="auto", **kwargs_from_env())
            except DockerException as e:
                self.log.error(
                    "\nDocker client initialization error: %s.\nCheck if docker is running on the host.\n",
                    e,
                )
                self.exit(1)

        # If the source to be executed is a directory, continue using the
        # directory. In the case of a local directory, it is used as both the
        # source and target. Reusing a local directory seems better than
        # making a copy of it as it might contain large files that would be
        # expensive to copy.
        if os.path.isdir(self.repo):
            checkout_path = self.repo
        else:
            if self.git_workdir is None:
                checkout_path = tempfile.mkdtemp(prefix="repo2docker")
            else:
                checkout_path = self.git_workdir

        try:
            self.fetch(self.repo, self.ref, checkout_path)

            if self.find_image():
                self.log.info(
                    "Reusing existing image ({}), not "
                    "building.".format(self.output_image_spec)
                )
                # no need to build, so skip to the end by `return`ing here
                # this will still execute the finally clause and lets us
                # avoid having to indent the build code by an extra level
                return

            if self.subdir:
                checkout_path = os.path.join(checkout_path, self.subdir)
                if not os.path.isdir(checkout_path):
                    self.log.error(
                        "Subdirectory %s does not exist",
                        self.subdir,
                        extra=dict(phase="failure"),
                    )
                    raise FileNotFoundError("Could not find {}".format(checkout_path))

            # Buildpack detect() implementations inspect the current working
            # directory, hence the chdir context.
            with chdir(checkout_path):
                # First buildpack whose detect() matches wins; fall back to
                # the configured default when none match.
                for BP in self.buildpacks:
                    bp = BP()
                    if bp.detect():
                        picked_buildpack = bp
                        break
                else:
                    picked_buildpack = self.default_buildpack()

                picked_buildpack.appendix = self.appendix
                # Add metadata labels
                picked_buildpack.labels["repo2docker.version"] = self.version
                repo_label = "local" if os.path.isdir(self.repo) else self.repo
                picked_buildpack.labels["repo2docker.repo"] = repo_label
                picked_buildpack.labels["repo2docker.ref"] = self.ref

                if self.dry_run:
                    print(picked_buildpack.render())
                else:
                    self.log.debug(
                        picked_buildpack.render(), extra=dict(phase="building")
                    )
                    if self.user_id == 0:
                        raise ValueError(
                            "Root as the primary user in the image is not permitted."
                        )

                    build_args = {
                        "NB_USER": self.user_name,
                        "NB_UID": str(self.user_id),
                    }
                    if self.target_repo_dir:
                        build_args["REPO_DIR"] = self.target_repo_dir
                    # NOTE(review): this logs `bp` (the last buildpack tried
                    # in the loop), not `picked_buildpack` — they differ when
                    # the default buildpack fallback is used. Looks like it
                    # should be picked_buildpack; confirm before changing.
                    self.log.info(
                        "Using %s builder\n",
                        bp.__class__.__name__,
                        extra=dict(phase="building"),
                    )

                    for l in picked_buildpack.build(
                        docker_client,
                        self.output_image_spec,
                        self.build_memory_limit,
                        build_args,
                        self.cache_from,
                        self.extra_build_kwargs,
                    ):
                        if "stream" in l:
                            self.log.info(l["stream"], extra=dict(phase="building"))
                        elif "error" in l:
                            self.log.info(l["error"], extra=dict(phase="failure"))
                            raise docker.errors.BuildError(l["error"], build_log="")
                        elif "status" in l:
                            self.log.info(
                                "Fetching base image...\r", extra=dict(phase="building")
                            )
                        else:
                            # unknown event shape — dump it verbatim as JSON
                            self.log.info(json.dumps(l), extra=dict(phase="building"))

        finally:
            # Cleanup checkout if necessary
            if self.cleanup_checkout:
                shutil.rmtree(checkout_path, ignore_errors=True)
def start(self):
|
|
self.build()
|
|
|
|
if self.push:
|
|
self.push_image()
|
|
|
|
if self.run:
|
|
self.run_image()
|