# Source: repo2docker/repo2docker/app.py
#
# 749 lines
# 24 KiB
# Python

"""repo2docker: convert git repositories into jupyter-suitable docker images
Images produced by repo2docker can be used with Jupyter notebooks standalone
or with BinderHub.
Usage:
python -m repo2docker https://github.com/you/your-repo
"""
import json
import sys
import logging
import os
import getpass
import shutil
import tempfile
import time
import docker
from urllib.parse import urlparse
from docker.utils import kwargs_from_env
from docker.errors import DockerException
import escapism
from pythonjsonlogger import jsonlogger
from traitlets import Any, Dict, Int, List, Unicode, Bool, default
from traitlets.config import Application
from . import __version__
from .buildpacks import (
CondaBuildPack,
DockerBuildPack,
JuliaProjectTomlBuildPack,
JuliaRequireBuildPack,
LegacyBinderDockerBuildPack,
NixBuildPack,
PipfileBuildPack,
PythonBuildPack,
RBuildPack,
)
from . import contentproviders
from .utils import ByteSpecification, chdir
class Repo2Docker(Application):
    """An application for converting git repositories to docker images.

    Behaviour is configured through the traitlets declared on this class.
    ``start()`` builds the image and then, depending on the ``push`` and
    ``run`` traits, pushes it to a registry and/or runs a container from it.
    """

    # Application name and version. `version` is also written to the
    # "repo2docker.version" image label by build().
    name = "jupyter-repo2docker"
    version = __version__
    # The module docstring doubles as the application description.
    description = __doc__
@default("log_level")
def _default_log_level(self):
    """Provide the default for the ``log_level`` trait: ``logging.INFO``."""
    level = logging.INFO
    return level
# Checkout location for repositories that are not local directories.
# When left as None, build() creates a fresh directory with
# tempfile.mkdtemp(prefix="repo2docker").
git_workdir = Unicode(
    None,
    config=True,
    allow_none=True,
    help="""
Working directory to use for check out of git repositories.
The default is to use the system's temporary directory. Should be
somewhere ephemeral, such as /tmp.
""",
)
# Sub-path of the checkout that build() switches into before detecting a
# buildpack. fetch() also appends it (escaped) to auto-generated image names.
subdir = Unicode(
    "",
    config=True,
    help="""
Subdirectory of the git repository to examine.
Defaults to ''.
""",
)
# Handed unchanged to the picked buildpack's build() call in build().
cache_from = List(
    [],
    config=True,
    help="""
List of images to try & re-use cached image layers from.
Docker only tries to re-use image layers from images built locally,
not pulled from a registry. We can ask it to explicitly re-use layers
from non-locally built images by through the 'cache_from' parameter.
""",
)
# build() instantiates these classes in order and picks the first one whose
# detect() returns a truthy value, so list order expresses priority.
buildpacks = List(
    [
        LegacyBinderDockerBuildPack,
        DockerBuildPack,
        JuliaProjectTomlBuildPack,
        JuliaRequireBuildPack,
        NixBuildPack,
        RBuildPack,
        CondaBuildPack,
        PipfileBuildPack,
        PythonBuildPack,
    ],
    config=True,
    help="""
Ordered list of BuildPacks to try when building a git repository.
""",
)
# Passed as the last argument of the picked buildpack's build() call.
extra_build_kwargs = Dict(
    {},
    help="""
extra kwargs to limit CPU quota when building a docker image.
Dictionary that allows the user to set the desired runtime flag
to configure the amount of access to CPU resources your container has.
Reference https://docs.docker.com/config/containers/resource_constraints/#cpu
""",
    config=True,
)
# Merged (via dict.update) into the keyword arguments of
# client.containers.run() in start_container(), so entries here override the
# values that method computes itself.
extra_run_kwargs = Dict(
    {},
    help="""
extra kwargs to limit CPU quota when running a docker image.
Dictionary that allows the user to set the desired runtime flag
to configure the amount of access to CPU resources your container has.
Reference https://docs.docker.com/config/containers/resource_constraints/#cpu
""",
    config=True,
)
# Instantiated by build() when no entry of `buildpacks` detects the repository.
default_buildpack = Any(
    PythonBuildPack,
    config=True,
    help="""
The default build pack to use when no other buildpacks are found.
""",
)
# Git is our content provider of last resort. This is to maintain the
# old behaviour when git and local directories were the only supported
# content providers. We can detect local directories from the path, but
# detecting if something will successfully `git clone` is very hard if all
# you can do is look at the path/URL to it.
#
# fetch() instantiates these in order and uses the first provider whose
# detect() returns a non-None spec.
content_providers = List(
    [
        contentproviders.Local,
        contentproviders.Zenodo,
        contentproviders.Figshare,
        contentproviders.Dataverse,
        contentproviders.Git,
    ],
    config=True,
    help="""
Ordered list by priority of ContentProviders to try in turn to fetch
the contents specified by the user.
""",
)
# Passed to the picked buildpack's build() call in build().
# NOTE(review): ByteSpecification comes from .utils; presumably it accepts
# suffixed sizes as well as plain integers -- confirm there.
build_memory_limit = ByteSpecification(
    0,
    help="""
Total memory that can be used by the docker image building process.
Set to 0 for no limits.
""",
    config=True,
)
# Consumed by start_container(): each source key is made absolute with
# os.path.abspath, and a destination that does not start with "/" is joined
# onto the image's working directory. All mounts use mode "rw".
# initialize() raises ValueError when volumes are given without `run`.
volumes = Dict(
    {},
    help="""
Volumes to mount when running the container.
Only used when running, not during build process!
Use a key-value pair, with the key being the volume source &
value being the destination volume.
Both source and destination can be relative. Source is resolved
relative to the current working directory on the host, and
destination is resolved relative to the working directory of the
image - ($HOME by default)
""",
    config=True,
)
# Sent to the image build as the NB_UID build argument (as a string).
# build() raises ValueError when this is 0 and an image is actually built.
user_id = Int(
    help="""
UID of the user to create inside the built image.
Should be a uid that is not currently used by anything in the image.
Defaults to uid of currently running user, since that is the most
common case when running r2d manually.
Might not affect Dockerfile builds.
""",
    config=True,
)
@default("user_id")
def _user_id_default(self):
    """Provide the default ``user_id``: the effective UID of this process."""
    effective_uid = os.geteuid()
    return effective_uid
# Sent to the image build as the NB_USER build argument.
# NOTE(review): the static default here is "jovyan", while the help text and
# the @default("user_name") method in this class both say the default is the
# current user's name -- confirm which one traitlets actually applies.
user_name = Unicode(
    "jovyan",
    help="""
Username of the user to create inside the built image.
Should be a username that is not currently used by anything in the
image, and should conform to the restrictions on user names for Linux.
Defaults to username of currently running user, since that is the most
common case when running repo2docker manually.
""",
    config=True,
)
@default("user_name")
def _user_name_default(self):
    """Provide the default ``user_name``: the login name from ``getpass``."""
    current_user = getpass.getuser()
    return current_user
# Assigned to the picked buildpack's `appendix` attribute in build().
appendix = Unicode(
    config=True,
    help="""
Appendix of Dockerfile commands to run at the end of the build.
Can be used to customize the resulting image after all
standard build steps finish.
""",
)
# When true, initialize() installs a JSON log formatter and a JSON-emitting
# sys.excepthook. fetch() also forwards it as `yield_output` to the content
# provider.
json_logs = Bool(
    False,
    help="""
Log output in structured JSON format.
Useful when stdout is consumed by other tools
""",
    config=True,
)
# build() treats the value as a local checkout when os.path.isdir(repo) is
# true; otherwise the contents are fetched into a working directory.
repo = Unicode(
    ".",
    help="""
Specification of repository to build image for.
Could be local path or git URL.
""",
    config=True,
)
# Passed to the content provider's detect() via fetch(), and recorded in the
# "repo2docker.ref" image label by build().
ref = Unicode(
    None,
    help="""
Git ref that should be built.
If repo is a git repository, this ref is checked out
in a local clone before repository is built.
""",
    config=True,
    allow_none=True,
)
# When true, build() removes the checkout path with shutil.rmtree (errors
# ignored) in its `finally` clause, i.e. also when the build fails.
cleanup_checkout = Bool(
    False,
    help="""
Delete source repository after building is done.
Useful when repo2docker is doing the git cloning
""",
    config=True,
)
# NOTE(review): the help text says "Required parameter", but fetch() generates
# a name ("r2d" + escaped repo [+ escaped subdir] + content id or timestamp)
# whenever this is left empty.
output_image_spec = Unicode(
    "",
    help="""
Docker Image name:tag to tag the built image with.
Required parameter.
""",
    config=True,
)
# start() calls push_image() after the build when this is true.
push = Bool(
    False,
    help="""
Set to true to push docker image after building
""",
    config=True,
)
# start() calls run_image() after the build (and push) when this is true.
run = Bool(
    False,
    help="""
Run docker image after building
""",
    config=True,
)
# FIXME: Refactor class to be able to do --no-build without needing
# deep support for it inside other code
#
# Effects visible in this class: build() prints the rendered Dockerfile
# instead of building, find_image() returns False without contacting docker,
# and initialize() rejects combining it with `run` or `push`.
dry_run = Bool(
    False,
    help="""
Do not actually build the docker image, just simulate it.
""",
    config=True,
)
# FIXME: Refactor classes to separate build & run steps
#
# When empty, start_container() launches `jupyter notebook` on a free port
# and publishes that port.
run_cmd = List(
    [],
    help="""
Command to run when running the container
When left empty, a jupyter notebook is run.
""",
    config=True,
)
# Passed as `publish_all_ports` to client.containers.run().
# NOTE(review): "whiel" in the help text below is a typo for "while".
all_ports = Bool(
    False,
    help="""
Publish all declared ports from container whiel running.
Equivalent to -P option to docker run
""",
    config=True,
)
# Only honoured by start_container() when `run_cmd` is set; without a
# run_cmd a single free-port mapping is generated instead. In both cases
# start_container() writes the mapping it used back to this trait.
ports = Dict(
    {},
    help="""
Port mappings to establish when running the container.
Equivalent to -p {key}:{value} options to docker run.
{key} refers to port inside container, and {value}
refers to port / host:port in the host
""",
    config=True,
)
# Passed as `environment` to client.containers.run() in start_container().
environment = List(
    [],
    help="""
Environment variables to set when running the built image.
Each item must be a string formatted as KEY=VALUE
""",
    config=True,
)
# Forwarded as the REPO_DIR build argument by build(), only when non-empty.
target_repo_dir = Unicode(
    "",
    help="""
Path inside the image where contents of the repositories are copied to,
and where all the build operations (such as postBuild) happen.
Defaults to ${HOME} if not set
""",
    config=True,
)
def fetch(self, url, ref, checkout_path):
    """Fetch the contents of `url` and place it in `checkout_path`.

    The `ref` parameter specifies what "version" of the contents should be
    fetched. In the case of a git repository `ref` is the SHA-1 of a commit.

    Content providers from ``self.content_providers`` are tried in order
    and the first one whose ``detect()`` returns a non-None spec is used.

    If ``self.output_image_spec`` is empty, an image name is generated from
    the repository, the optional subdirectory, and either the provider's
    ``content_id`` or the current time.

    Raises:
        ValueError: if no configured content provider recognises `url`.
    """
    picked_content_provider = None
    spec = None
    for ContentProvider in self.content_providers:
        cp = ContentProvider()
        spec = cp.detect(url, ref=ref)
        if spec is not None:
            picked_content_provider = cp
            self.log.info(
                "Picked {cp} content "
                "provider.\n".format(cp=cp.__class__.__name__)
            )
            break

    if picked_content_provider is None:
        # Fail loudly here. Continuing would only crash a few lines later
        # with an AttributeError on None that hides the real problem.
        message = "No matching content provider found for {url}.".format(url=url)
        self.log.error(message)
        raise ValueError(message)

    for log_line in picked_content_provider.fetch(
        spec, checkout_path, yield_output=self.json_logs
    ):
        self.log.info(log_line, extra=dict(phase="fetching"))

    if not self.output_image_spec:
        self.output_image_spec = (
            "r2d" + escapism.escape(self.repo, escape_char="-").lower()
        )
        # if we are building from a subdirectory include that in the
        # image name so we can tell builds from different sub-directories
        # apart.
        if self.subdir:
            self.output_image_spec += escapism.escape(
                self.subdir, escape_char="-"
            ).lower()
        if picked_content_provider.content_id is not None:
            self.output_image_spec += picked_content_provider.content_id
        else:
            # No stable content id: fall back to a timestamp so that the
            # generated name is at least unique per invocation.
            self.output_image_spec += str(int(time.time()))
def json_excepthook(self, etype, evalue, traceback):
    """Report an uncaught exception through the application logger.

    Installed as ``sys.excepthook`` when JSON logging is active, so that a
    crash is emitted as a log record with ``phase="failed"`` rather than as
    a plain-text traceback.
    """
    failure = (etype, evalue, traceback)
    self.log.error(
        "Error during build: %s",
        evalue,
        exc_info=failure,
        extra={"phase": "failed"},
    )
def initialize(self):
    """Set up logging and validate option combinations before start.

    Raises:
        ValueError: if ``dry_run`` is combined with ``run`` or ``push``, or
            if ``volumes`` are configured without ``run``.
    """
    # FIXME: Remove this function, move it to setters / traitlet reactors
    if not self.json_logs:
        # Plain-text mode: our messages already carry their own newlines and
        # carriage returns, so drop the handler's terminator, and drop the
        # "[Repo2Docker]" prefix by formatting the bare message only.
        plain_handler = self.log.handlers[0]
        plain_handler.terminator = ""
        plain_handler.formatter = logging.Formatter(fmt="%(message)s")
    else:
        # Route uncaught exceptions through the logger so that errors do not
        # produce non-JSON output.
        sys.excepthook = self.json_excepthook
        json_handler = logging.StreamHandler()
        json_handler.setFormatter(jsonlogger.JsonFormatter())
        self.log = logging.getLogger("repo2docker")
        # Existing handlers must go, otherwise every message is repeated.
        self.log.handlers = []
        self.log.addHandler(json_handler)
        self.log.setLevel(self.log_level)

    if self.dry_run:
        if self.run or self.push:
            raise ValueError("Cannot push or run image if we are not building it")

    if self.volumes:
        if not self.run:
            raise ValueError("Cannot mount volumes if container is not run")
def push_image(self):
    """Push the built image to its registry, logging progress.

    Progress events streamed by docker are aggregated per layer and logged
    at most once every 1.5 seconds with ``phase="pushing"``.

    Raises:
        docker.errors.ImageLoadError: if a progress event carries an
            ``"error"`` entry.
    """
    client = docker.APIClient(version="auto", **kwargs_from_env())
    # Build a progress setup for each layer, and only emit per-layer
    # info every 1.5s
    progress_layers = {}
    layers = {}
    last_emit_time = time.time()
    for chunk in client.push(self.output_image_spec, stream=True):
        # each chunk can be one or more lines of json events
        # split lines here in case multiple are delivered at once
        for line in chunk.splitlines():
            line = line.decode("utf-8", errors="replace")
            try:
                progress = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError subclass; catching
                # only that keeps unrelated errors visible.
                self.log.warning("Not a JSON progress line: %r", line)
                continue
            if "error" in progress:
                self.log.error(progress["error"], extra=dict(phase="failed"))
                raise docker.errors.ImageLoadError(progress["error"])
            if "id" not in progress:
                # Events without a layer id carry nothing to aggregate.
                continue
            # deprecated truncated-progress data
            if "progressDetail" in progress and progress["progressDetail"]:
                progress_layers[progress["id"]] = progress["progressDetail"]
            else:
                progress_layers[progress["id"]] = progress["status"]
            # include full progress data for each layer in 'layers' data
            layers[progress["id"]] = progress
            if time.time() - last_emit_time > 1.5:
                self.log.info(
                    "Pushing image\n",
                    extra=dict(
                        progress=progress_layers, layers=layers, phase="pushing"
                    ),
                )
                last_emit_time = time.time()
    self.log.info(
        "Successfully pushed {}".format(self.output_image_spec),
        extra=dict(phase="pushing"),
    )
def run_image(self):
    """Start a container from the built image and block until it exits."""
    self.wait_for_container(self.start_container())
def start_container(self):
    """Start a docker container from the built image.

    Side effects: sets ``self.hostname`` and ``self.ports`` and, when no
    ``run_cmd`` is configured, ``self.port``.

    Returns:
        The container object returned by ``client.containers.run``, once it
        has left the "created" state.
    """
    client = docker.from_env(version="auto")

    docker_host = os.environ.get("DOCKER_HOST")
    # urlparse(...).hostname is None for socket-style values such as
    # unix:///var/run/docker.sock; fall back to loopback instead of
    # advertising "http://None:<port>".
    parsed_host = urlparse(docker_host).hostname if docker_host else None
    host_name = parsed_host or "127.0.0.1"
    self.hostname = host_name

    if not self.run_cmd:
        port = str(self._get_free_port())
        self.port = port
        # To use the option --NotebookApp.custom_display_url
        # make sure the base-notebook image is updated:
        # docker pull jupyter/base-notebook
        run_cmd = [
            "jupyter",
            "notebook",
            "--ip",
            "0.0.0.0",
            "--port",
            port,
            "--NotebookApp.custom_display_url=http://{}:{}".format(host_name, port),
        ]
        ports = {"%s/tcp" % port: port}
    else:
        # run_cmd given by user, if port is also given then pass it on
        run_cmd = self.run_cmd
        ports = self.ports if self.ports else {}
    # store ports on self so they can be retrieved in tests
    self.ports = ports

    container_volumes = {}
    if self.volumes:
        api_client = docker.APIClient(version="auto", **kwargs_from_env())
        image = api_client.inspect_image(self.output_image_spec)
        # Newer engines no longer report "ContainerConfig" in the inspect
        # output; "Config" carries the same WorkingDir entry.
        image_config = image.get("ContainerConfig") or image["Config"]
        image_workdir = image_config["WorkingDir"]

        for source, destination in self.volumes.items():
            # Host paths are resolved against the current directory and
            # relative container paths against the image's working directory.
            if not destination.startswith("/"):
                destination = os.path.join(image_workdir, destination)
            container_volumes[os.path.abspath(source)] = {
                "bind": destination,
                "mode": "rw",
            }

    run_kwargs = dict(
        publish_all_ports=self.all_ports,
        ports=ports,
        detach=True,
        command=run_cmd,
        volumes=container_volumes,
        environment=self.environment,
    )
    # User-supplied kwargs win over the values computed above.
    run_kwargs.update(self.extra_run_kwargs)

    container = client.containers.run(self.output_image_spec, **run_kwargs)

    # Poll until docker has actually started (or failed to start) it.
    while container.status == "created":
        time.sleep(0.5)
        container.reload()

    return container
def wait_for_container(self, container):
    """Stream a container's logs until it finishes, then clean it up.

    Each log line is emitted through ``self.log`` with ``phase="running"``.
    Whether the log stream ends normally or is interrupted, a container
    that is still running is killed, and the container is removed.

    Raises:
        SystemExit: with the container's exit code when it is non-zero.
    """
    try:
        for line in container.logs(stream=True):
            # Container output is arbitrary bytes; replace undecodable
            # sequences instead of aborting the log stream.
            self.log.info(
                line.decode("utf-8", errors="replace"),
                extra=dict(phase="running"),
            )
    finally:
        container.reload()
        if container.status == "running":
            self.log.info("Stopping container...\n", extra=dict(phase="running"))
            container.kill()
        exit_code = container.attrs["State"]["ExitCode"]
        container.remove()
        if exit_code:
            sys.exit(exit_code)
def _get_free_port(self):
"""
Hacky method to get a free random port on local host
"""
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("", 0))
port = s.getsockname()[1]
s.close()
return port
def find_image(self):
    """Report whether ``<output_image_spec>:latest`` already exists locally.

    Dry runs always answer False, because it is fine for dockerd to be
    unreachable in that mode.
    """
    if self.dry_run:
        return False

    # check if we already have an image for this content
    client = docker.APIClient(version="auto", **kwargs_from_env())
    for image in client.images():
        repo_tags = image["RepoTags"]
        if repo_tags is None:
            continue
        if any(tag == self.output_image_spec + ":latest" for tag in repo_tags):
            return True
    return False
def build(self):
    """Build the docker image for ``self.repo``.

    Steps: connect to docker (skipped for dry runs), pick a checkout
    directory, fetch the contents, return early if the image already
    exists, pick a buildpack, then either print the rendered Dockerfile
    (dry run) or run the build while relaying its output to the log.

    Raises:
        FileNotFoundError: if ``subdir`` does not exist in the checkout.
        ValueError: if an image is to be built with ``user_id`` 0.
        docker.errors.BuildError: if the build stream reports an error.
    """
    # Check if r2d can connect to docker daemon
    docker_client = None  # stays None for dry runs, where it is never used
    if not self.dry_run:
        try:
            docker_client = docker.APIClient(version="auto", **kwargs_from_env())
        except DockerException as e:
            self.log.error(
                "\nDocker client initialization error: %s.\nCheck if docker is running on the host.\n",
                e,
            )
            self.exit(1)

    # If the source to be executed is a directory, continue using the
    # directory. In the case of a local directory, it is used as both the
    # source and target. Reusing a local directory seems better than
    # making a copy of it as it might contain large files that would be
    # expensive to copy.
    if os.path.isdir(self.repo):
        checkout_path = self.repo
    elif self.git_workdir is None:
        checkout_path = tempfile.mkdtemp(prefix="repo2docker")
    else:
        checkout_path = self.git_workdir
    # `checkout_path` may be narrowed to a subdirectory below; remember the
    # root so that cleanup removes the whole checkout, not just that subdir.
    checkout_root = checkout_path

    try:
        self.fetch(self.repo, self.ref, checkout_path)

        if self.find_image():
            self.log.info(
                "Reusing existing image ({}), not "
                "building.".format(self.output_image_spec)
            )
            # no need to build, so skip to the end by `return`ing here
            # this will still execute the finally clause and let's us
            # avoid having to indent the build code by an extra level
            return

        if self.subdir:
            checkout_path = os.path.join(checkout_path, self.subdir)
            if not os.path.isdir(checkout_path):
                self.log.error(
                    "Subdirectory %s does not exist",
                    self.subdir,
                    extra=dict(phase="failure"),
                )
                raise FileNotFoundError("Could not find {}".format(checkout_path))

        with chdir(checkout_path):
            # First buildpack that recognises the repository wins; the
            # for/else falls back to the default when none does.
            for BP in self.buildpacks:
                bp = BP()
                if bp.detect():
                    picked_buildpack = bp
                    break
            else:
                picked_buildpack = self.default_buildpack()

            picked_buildpack.appendix = self.appendix

            # Add metadata labels
            picked_buildpack.labels["repo2docker.version"] = self.version
            repo_label = "local" if os.path.isdir(self.repo) else self.repo
            picked_buildpack.labels["repo2docker.repo"] = repo_label
            picked_buildpack.labels["repo2docker.ref"] = self.ref

            if self.dry_run:
                print(picked_buildpack.render())
            else:
                self.log.debug(
                    picked_buildpack.render(), extra=dict(phase="building")
                )
                if self.user_id == 0:
                    raise ValueError(
                        "Root as the primary user in the image is not permitted."
                    )

                build_args = {
                    "NB_USER": self.user_name,
                    "NB_UID": str(self.user_id),
                }
                if self.target_repo_dir:
                    build_args["REPO_DIR"] = self.target_repo_dir
                # Report the buildpack actually used. The loop variable
                # would name the last one tried, or be unbound when
                # `buildpacks` is empty.
                self.log.info(
                    "Using %s builder\n",
                    picked_buildpack.__class__.__name__,
                    extra=dict(phase="building"),
                )

                for event in picked_buildpack.build(
                    docker_client,
                    self.output_image_spec,
                    self.build_memory_limit,
                    build_args,
                    self.cache_from,
                    self.extra_build_kwargs,
                ):
                    if "stream" in event:
                        self.log.info(event["stream"], extra=dict(phase="building"))
                    elif "error" in event:
                        self.log.info(event["error"], extra=dict(phase="failure"))
                        raise docker.errors.BuildError(event["error"], build_log="")
                    elif "status" in event:
                        self.log.info(
                            "Fetching base image...\r", extra=dict(phase="building")
                        )
                    else:
                        self.log.info(json.dumps(event), extra=dict(phase="building"))

    finally:
        # Cleanup checkout if necessary
        if self.cleanup_checkout:
            shutil.rmtree(checkout_root, ignore_errors=True)
def start(self):
    """Build the image, then push and/or run it as configured."""
    self.build()

    if self.push:
        self.push_image()

    if not self.run:
        return
    self.run_image()