repo2docker/repo2docker/app.py

"""repo2docker: convert git repositories into jupyter-suitable docker images

Images produced by repo2docker can be used with Jupyter notebooks standalone
or with BinderHub.

Usage:

    python -m repo2docker https://github.com/you/your-repo
"""
import json
import sys
import logging
import os
import getpass
import shutil
import tempfile
import time

import docker
from urllib.parse import urlparse
from docker.utils import kwargs_from_env
from docker.errors import DockerException
import escapism
from pythonjsonlogger import jsonlogger

from traitlets import Any, Dict, Int, List, Unicode, Bool, default
from traitlets.config import Application

from . import __version__
from .buildpacks import (
    CondaBuildPack,
    DockerBuildPack,
    JuliaProjectTomlBuildPack,
    JuliaRequireBuildPack,
    LegacyBinderDockerBuildPack,
    NixBuildPack,
    PipfileBuildPack,
    PythonBuildPack,
    RBuildPack,
)
from . import contentproviders
from .utils import ByteSpecification, chdir


class Repo2Docker(Application):
    """An application for converting git repositories to docker images"""

    name = "jupyter-repo2docker"
    version = __version__
    description = __doc__

    @default("log_level")
    def _default_log_level(self):
        """The application's default log level"""
        return logging.INFO

    git_workdir = Unicode(
        None,
        config=True,
        allow_none=True,
        help="""
        Working directory to use for check out of git repositories.

        The default is to use the system's temporary directory. Should be
        somewhere ephemeral, such as /tmp.
        """,
    )

    subdir = Unicode(
        "",
        config=True,
        help="""
        Subdirectory of the git repository to examine.

        Defaults to ''.
        """,
    )

    cache_from = List(
        [],
        config=True,
        help="""
        List of images to try & re-use cached image layers from.

        Docker only tries to re-use image layers from images built locally,
        not pulled from a registry. We can ask it to explicitly re-use layers
        from non-locally built images by through the 'cache_from' parameter.
        """,
    )

    buildpacks = List(
        [
            LegacyBinderDockerBuildPack,
            DockerBuildPack,
            JuliaProjectTomlBuildPack,
            JuliaRequireBuildPack,
            NixBuildPack,
            RBuildPack,
            CondaBuildPack,
            PipfileBuildPack,
            PythonBuildPack,
        ],
        config=True,
        help="""
        Ordered list of BuildPacks to try when building a git repository.
        """,
    )

    extra_build_kwargs = Dict(
        {},
        help="""
        extra kwargs to limit CPU quota when building a docker image.
        Dictionary that allows the user to set the desired runtime flag
        to configure the amount of access to CPU resources your container has.
        Reference https://docs.docker.com/config/containers/resource_constraints/#cpu
        """,
        config=True,
    )

    extra_run_kwargs = Dict(
        {},
        help="""
        extra kwargs to limit CPU quota when running a docker image.
        Dictionary that allows the user to set the desired runtime flag
        to configure the amount of access to CPU resources your container has.
        Reference https://docs.docker.com/config/containers/resource_constraints/#cpu
        """,
        config=True,
    )

    default_buildpack = Any(
        PythonBuildPack,
        config=True,
        help="""
        The default build pack to use when no other buildpacks are found.
        """,
    )

    # Git is our content provider of last resort. This is to maintain the
    # old behaviour when git and local directories were the only supported
    # content providers. We can detect local directories from the path, but
    # detecting if something will successfully `git clone` is very hard if all
    # you can do is look at the path/URL to it.
    content_providers = List(
        [
            contentproviders.Local,
            contentproviders.Zenodo,
            contentproviders.Figshare,
            contentproviders.Dataverse,
            contentproviders.Git,
        ],
        config=True,
        help="""
        Ordered list by priority of ContentProviders to try in turn to fetch
        the contents specified by the user.
        """,
    )

    build_memory_limit = ByteSpecification(
        0,
        help="""
        Total memory that can be used by the docker image building process.

        Set to 0 for no limits.
        """,
        config=True,
    )

    volumes = Dict(
        {},
        help="""
        Volumes to mount when running the container.

        Only used when running, not during build process!

        Use a key-value pair, with the key being the volume source &
        value being the destination volume.

        Both source and destination can be relative. Source is resolved
        relative to the current working directory on the host, and
        destination is resolved relative to the working directory of the
        image - ($HOME by default)
        """,
        config=True,
    )

    user_id = Int(
        help="""
        UID of the user to create inside the built image.

        Should be a uid that is not currently used by anything in the image.
        Defaults to uid of currently running user, since that is the most
        common case when running r2d manually.

        Might not affect Dockerfile builds.
        """,
        config=True,
    )

    @default("user_id")
    def _user_id_default(self):
        """
        Default user_id to current running user.
        """
        return os.geteuid()

    user_name = Unicode(
        "jovyan",
        help="""
        Username of the user to create inside the built image.

        Should be a username that is not currently used by anything in the
        image, and should conform to the restrictions on user names for Linux.

        Defaults to username of currently running user, since that is the most
        common case when running repo2docker manually.
        """,
        config=True,
    )

    @default("user_name")
    def _user_name_default(self):
        """
        Default user_name to current running user.
        """
        return getpass.getuser()

    appendix = Unicode(
        config=True,
        help="""
        Appendix of Dockerfile commands to run at the end of the build.

        Can be used to customize the resulting image after all
        standard build steps finish.
        """,
    )

    json_logs = Bool(
        False,
        help="""
        Log output in structured JSON format.

        Useful when stdout is consumed by other tools
        """,
        config=True,
    )

    repo = Unicode(
        ".",
        help="""
        Specification of repository to build image for.

        Could be local path or git URL.
        """,
        config=True,
    )

    ref = Unicode(
        None,
        help="""
        Git ref that should be built.

        If repo is a git repository, this ref is checked out
        in a local clone before repository is built.
        """,
        config=True,
        allow_none=True,
    )

    cleanup_checkout = Bool(
        False,
        help="""
        Delete source repository after building is done.

        Useful when repo2docker is doing the git cloning
        """,
        config=True,
    )

    output_image_spec = Unicode(
        "",
        help="""
        Docker Image name:tag to tag the built image with.

        Required parameter.
        """,
        config=True,
    )

    push = Bool(
        False,
        help="""
        Set to true to push docker image after building
        """,
        config=True,
    )

    run = Bool(
        False,
        help="""
        Run docker image after building
        """,
        config=True,
    )

    # FIXME: Refactor class to be able to do --no-build without needing
    #        deep support for it inside other code
    dry_run = Bool(
        False,
        help="""
        Do not actually build the docker image, just simulate it.
        """,
        config=True,
    )

    # FIXME: Refactor classes to separate build & run steps
    run_cmd = List(
        [],
        help="""
        Command to run when running the container

        When left empty, a jupyter notebook is run.
        """,
        config=True,
    )

    all_ports = Bool(
        False,
        help="""
        Publish all declared ports from container whiel running.

        Equivalent to -P option to docker run
        """,
        config=True,
    )

    ports = Dict(
        {},
        help="""
        Port mappings to establish when running the container.

        Equivalent to -p {key}:{value} options to docker run.
        {key} refers to port inside container, and {value}
        refers to port / host:port in the host
        """,
        config=True,
    )

    environment = List(
        [],
        help="""
        Environment variables to set when running the built image.

        Each item must be a string formatted as KEY=VALUE
        """,
        config=True,
    )

    target_repo_dir = Unicode(
        "",
        help="""
        Path inside the image where contents of the repositories are copied to,
        and where all the build operations (such as postBuild) happen.

        Defaults to ${HOME} if not set
        """,
        config=True,
    )

    def fetch(self, url, ref, checkout_path):
        """Fetch the contents of `url` and place it in `checkout_path`.

        The `ref` parameter specifies what "version" of the contents should be
        fetched. In the case of a git repository `ref` is the SHA-1 of a commit.

        Iterate through possible content providers until a valid provider,
        based on URL, is found.
        """
        picked_content_provider = None
        for ContentProvider in self.content_providers:
            cp = ContentProvider()
            spec = cp.detect(url, ref=ref)
            if spec is not None:
                picked_content_provider = cp
                self.log.info(
                    "Picked {cp} content "
                    "provider.\n".format(cp=cp.__class__.__name__)
                )
                break

        if picked_content_provider is None:
            self.log.error(
                "No matching content provider found for " "{url}.".format(url=url)
            )

        for log_line in picked_content_provider.fetch(
            spec, checkout_path, yield_output=self.json_logs
        ):
            self.log.info(log_line, extra=dict(phase="fetching"))

        if not self.output_image_spec:
            self.output_image_spec = (
                "r2d" + escapism.escape(self.repo, escape_char="-").lower()
            )
            # if we are building from a subdirectory include that in the
            # image name so we can tell builds from different sub-directories
            # apart.
            if self.subdir:
                self.output_image_spec += escapism.escape(
                    self.subdir, escape_char="-"
                ).lower()
            if picked_content_provider.content_id is not None:
                self.output_image_spec += picked_content_provider.content_id
            else:
                self.output_image_spec += str(int(time.time()))

    def json_excepthook(self, etype, evalue, traceback):
        """Called on an uncaught exception when using json logging

        Avoids non-JSON output on errors when using --json-logs
        """
        self.log.error(
            "Error during build: %s",
            evalue,
            exc_info=(etype, evalue, traceback),
            extra=dict(phase="failed"),
        )

    def initialize(self):
        """Init repo2docker configuration before start"""
        # FIXME: Remove this function, move it to setters / traitlet reactors
        if self.json_logs:
            # register JSON excepthook to avoid non-JSON output on errors
            sys.excepthook = self.json_excepthook
            # Need to reset existing handlers, or we repeat messages
            logHandler = logging.StreamHandler()
            formatter = jsonlogger.JsonFormatter()
            logHandler.setFormatter(formatter)
            self.log = logging.getLogger("repo2docker")
            self.log.handlers = []
            self.log.addHandler(logHandler)
            self.log.setLevel(self.log_level)
        else:
            # due to json logger stuff above,
            # our log messages include carriage returns, newlines, etc.
            # remove the additional newline from the stream handler
            self.log.handlers[0].terminator = ""
            # We don't want a [Repo2Docker] on all messages
            self.log.handlers[0].formatter = logging.Formatter(fmt="%(message)s")

        if self.dry_run and (self.run or self.push):
            raise ValueError("Cannot push or run image if we are not building it")

        if self.volumes and not self.run:
            raise ValueError("Cannot mount volumes if container is not run")

    def push_image(self):
        """Push docker image to registry"""
        client = docker.APIClient(version="auto", **kwargs_from_env())
        # Build a progress setup for each layer, and only emit per-layer
        # info every 1.5s
        progress_layers = {}
        layers = {}
        last_emit_time = time.time()
        for chunk in client.push(self.output_image_spec, stream=True):
            # each chunk can be one or more lines of json events
            # split lines here in case multiple are delivered at once
            for line in chunk.splitlines():
                line = line.decode("utf-8", errors="replace")
                try:
                    progress = json.loads(line)
                except Exception as e:
                    self.log.warning("Not a JSON progress line: %r", line)
                    continue
                if "error" in progress:
                    self.log.error(progress["error"], extra=dict(phase="failed"))
                    raise docker.errors.ImageLoadError(progress["error"])
                if "id" not in progress:
                    continue
                # deprecated truncated-progress data
                if "progressDetail" in progress and progress["progressDetail"]:
                    progress_layers[progress["id"]] = progress["progressDetail"]
                else:
                    progress_layers[progress["id"]] = progress["status"]
                # include full progress data for each layer in 'layers' data
                layers[progress["id"]] = progress
                if time.time() - last_emit_time > 1.5:
                    self.log.info(
                        "Pushing image\n",
                        extra=dict(
                            progress=progress_layers, layers=layers, phase="pushing"
                        ),
                    )
                    last_emit_time = time.time()
        self.log.info(
            "Successfully pushed {}".format(self.output_image_spec),
            extra=dict(phase="pushing"),
        )

    def run_image(self):
        """Run docker container from built image

        and wait for it to finish.
        """
        container = self.start_container()
        self.wait_for_container(container)

    def start_container(self):
        """Start docker container from built image

        Returns running container
        """
        client = docker.from_env(version="auto")

        docker_host = os.environ.get("DOCKER_HOST")
        if docker_host:
            host_name = urlparse(docker_host).hostname
        else:
            host_name = "127.0.0.1"
        self.hostname = host_name

        if not self.run_cmd:
            port = str(self._get_free_port())
            self.port = port
            # To use the option --NotebookApp.custom_display_url
            # make sure the base-notebook image is updated:
            # docker pull jupyter/base-notebook
            run_cmd = [
                "jupyter",
                "notebook",
                "--ip",
                "0.0.0.0",
                "--port",
                port,
                "--NotebookApp.custom_display_url=http://{}:{}".format(host_name, port),
            ]
            ports = {"%s/tcp" % port: port}
        else:
            # run_cmd given by user, if port is also given then pass it on
            run_cmd = self.run_cmd
            if self.ports:
                ports = self.ports
            else:
                ports = {}
        # store ports on self so they can be retrieved in tests
        self.ports = ports

        container_volumes = {}
        if self.volumes:
            api_client = docker.APIClient(
                version="auto", **docker.utils.kwargs_from_env()
            )
            image = api_client.inspect_image(self.output_image_spec)
            image_workdir = image["ContainerConfig"]["WorkingDir"]

            for k, v in self.volumes.items():
                container_volumes[os.path.abspath(k)] = {
                    "bind": v if v.startswith("/") else os.path.join(image_workdir, v),
                    "mode": "rw",
                }

        run_kwargs = dict(
            publish_all_ports=self.all_ports,
            ports=ports,
            detach=True,
            command=run_cmd,
            volumes=container_volumes,
            environment=self.environment,
        )

        run_kwargs.update(self.extra_run_kwargs)

        container = client.containers.run(self.output_image_spec, **run_kwargs)

        while container.status == "created":
            time.sleep(0.5)
            container.reload()

        return container

    def wait_for_container(self, container):
        """Wait for a container to finish

        Displaying logs while it's running
        """

        try:
            for line in container.logs(stream=True):
                self.log.info(line.decode("utf-8"), extra=dict(phase="running"))
        finally:
            container.reload()
            if container.status == "running":
                self.log.info("Stopping container...\n", extra=dict(phase="running"))
                container.kill()
            exit_code = container.attrs["State"]["ExitCode"]
            container.remove()
            if exit_code:
                sys.exit(exit_code)

    def _get_free_port(self):
        """
        Hacky method to get a free random port on local host
        """
        import socket

        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(("", 0))
        port = s.getsockname()[1]
        s.close()
        return port

    def find_image(self):
        # if this is a dry run it is Ok for dockerd to be unreachable so we
        # always return False for dry runs.
        if self.dry_run:
            return False
        # check if we already have an image for this content
        client = docker.APIClient(version="auto", **kwargs_from_env())
        for image in client.images():
            if image["RepoTags"] is not None:
                for tag in image["RepoTags"]:
                    if tag == self.output_image_spec + ":latest":
                        return True
        return False

    def build(self):
        """
        Build docker image
        """
        # Check if r2d can connect to docker daemon
        if not self.dry_run:
            try:
                docker_client = docker.APIClient(version="auto", **kwargs_from_env())
            except DockerException as e:
                self.log.error(
                    "\nDocker client initialization error: %s.\nCheck if docker is running on the host.\n",
                    e,
                )
                self.exit(1)

        # If the source to be executed is a directory, continue using the
        # directory. In the case of a local directory, it is used as both the
        # source and target. Reusing a local directory seems better than
        # making a copy of it as it might contain large files that would be
        # expensive to copy.
        if os.path.isdir(self.repo):
            checkout_path = self.repo
        else:
            if self.git_workdir is None:
                checkout_path = tempfile.mkdtemp(prefix="repo2docker")
            else:
                checkout_path = self.git_workdir

        try:
            self.fetch(self.repo, self.ref, checkout_path)

            if self.find_image():
                self.log.info(
                    "Reusing existing image ({}), not "
                    "building.".format(self.output_image_spec)
                )
                # no need to build, so skip to the end by `return`ing here
                # this will still execute the finally clause and let's us
                # avoid having to indent the build code by an extra level
                return

            if self.subdir:
                checkout_path = os.path.join(checkout_path, self.subdir)
                if not os.path.isdir(checkout_path):
                    self.log.error(
                        "Subdirectory %s does not exist",
                        self.subdir,
                        extra=dict(phase="failure"),
                    )
                    raise FileNotFoundError("Could not find {}".format(checkout_path))

            with chdir(checkout_path):
                for BP in self.buildpacks:
                    bp = BP()
                    if bp.detect():
                        picked_buildpack = bp
                        break
                else:
                    picked_buildpack = self.default_buildpack()

                picked_buildpack.appendix = self.appendix
                # Add metadata labels
                picked_buildpack.labels["repo2docker.version"] = self.version
                repo_label = "local" if os.path.isdir(self.repo) else self.repo
                picked_buildpack.labels["repo2docker.repo"] = repo_label
                picked_buildpack.labels["repo2docker.ref"] = self.ref

                if self.dry_run:
                    print(picked_buildpack.render())
                else:
                    self.log.debug(
                        picked_buildpack.render(), extra=dict(phase="building")
                    )
                    if self.user_id == 0:
                        raise ValueError(
                            "Root as the primary user in the image is not permitted."
                        )

                    build_args = {
                        "NB_USER": self.user_name,
                        "NB_UID": str(self.user_id),
                    }
                    if self.target_repo_dir:
                        build_args["REPO_DIR"] = self.target_repo_dir
                    self.log.info(
                        "Using %s builder\n",
                        bp.__class__.__name__,
                        extra=dict(phase="building"),
                    )

                    for l in picked_buildpack.build(
                        docker_client,
                        self.output_image_spec,
                        self.build_memory_limit,
                        build_args,
                        self.cache_from,
                        self.extra_build_kwargs,
                    ):
                        if "stream" in l:
                            self.log.info(l["stream"], extra=dict(phase="building"))
                        elif "error" in l:
                            self.log.info(l["error"], extra=dict(phase="failure"))
                            raise docker.errors.BuildError(l["error"], build_log="")
                        elif "status" in l:
                            self.log.info(
                                "Fetching base image...\r", extra=dict(phase="building")
                            )
                        else:
                            self.log.info(json.dumps(l), extra=dict(phase="building"))

        finally:
            # Cleanup checkout if necessary
            if self.cleanup_checkout:
                shutil.rmtree(checkout_path, ignore_errors=True)

    def start(self):
        self.build()

        if self.push:
            self.push_image()

        if self.run:
            self.run_image()