repo2docker/repo2docker/app.py

583 wiersze
18 KiB
Python

"""repo2docker: convert git repositories into jupyter-suitable docker images
Images produced by repo2docker can be used with Jupyter notebooks standalone
or with BinderHub.
Usage:
python -m repo2docker https://github.com/you/your-repo
"""
import sys
import json
import os
import time
import logging
import argparse
import tempfile
from pythonjsonlogger import jsonlogger
import escapism
import pwd
from traitlets.config import Application
from traitlets import Unicode, List, default, Tuple, Dict, Int
import docker
from docker.utils import kwargs_from_env
import subprocess
from .buildpacks import (
PythonBuildPack, DockerBuildPack, LegacyBinderDockerBuildPack,
CondaBuildPack, JuliaBuildPack, Python2BuildPack, BaseImage
)
from .utils import execute_cmd, ByteSpecification, maybe_cleanup, is_valid_docker_image_name, validate_and_generate_port_mapping
from . import __version__
def compose(buildpacks, parent=None):
    """Compose a sequence of buildpack classes into a single image object.

    The first class is instantiated directly; every subsequent class is
    instantiated and folded into the result via ``compose_with``.
    """
    head, *rest = buildpacks
    composed = head(parent=parent)
    for buildpack_cls in rest:
        composed = composed.compose_with(buildpack_cls(parent=parent))
    return composed
class Repo2Docker(Application):
    """Traitlets Application that converts a repository into a docker image."""

    # Application name/version/description used by traitlets' config system;
    # description reuses the module docstring.
    name = 'jupyter-repo2docker'
    version = __version__
    description = __doc__

    @default('log_level')
    def _default_log_level(self):
        """Default the application log level to INFO (traitlets defaults to WARN)."""
        return logging.INFO

    # Where remote repositories get checked out; None means "use a fresh
    # tempfile.mkdtemp directory" (see start()).
    git_workdir = Unicode(
        None,
        config=True,
        allow_none=True,
        help="""
        Working directory to check out git repositories to.
        The default is to use the system's temporary directory. Should be
        somewhere ephemeral, such as /tmp.
        """
    )

    # Candidate buildpack compositions, tried in order by start(); the first
    # whose detect() returns true wins. Most-specific entries come first.
    buildpacks = List(
        [
            (LegacyBinderDockerBuildPack, ),
            (DockerBuildPack, ),
            (BaseImage, CondaBuildPack, JuliaBuildPack),
            (BaseImage, CondaBuildPack),
            (BaseImage, PythonBuildPack, Python2BuildPack, JuliaBuildPack),
            (BaseImage, PythonBuildPack, JuliaBuildPack),
            (BaseImage, PythonBuildPack, Python2BuildPack),
            (BaseImage, PythonBuildPack),
        ],
        config=True,
        help="""
        Ordered list of BuildPacks to try to use to build a git repository.
        """
    )

    # Fallback composition used when none of `buildpacks` detects the repo.
    default_buildpack = Tuple(
        (BaseImage, PythonBuildPack),
        config=True,
        help="""
        The build pack to use when no buildpacks are found
        """
    )

    # Passed through to the docker build; 0 disables the limit.
    build_memory_limit = ByteSpecification(
        0,
        help="""
        Total memory that can be used by the docker image building process.
        Set to 0 for no limits.
        """,
        config=True
    )

    # host-path -> container-path mapping, only honored by run_image().
    volumes = Dict(
        {},
        help="""
        Volumes to mount when running the container.
        Only used when running, not during build!
        Should be a key value pair, with the key being the volume source &
        value being the destination. Both can be relative - sources are
        resolved relative to the current working directory on the host,
        destination is resolved relative to the working directory of the image -
        ($HOME by default)
        """,
        config=True
    )

    user_id = Int(
        help="""
        UID of the user to create inside the built image.
        Should be a uid that is not currently used by anything in the image.
        Defaults to uid of currently running user, since that is the most
        common case when running r2d manually.
        Might not affect Dockerfile builds.
        """,
        config=True
    )

    @default('user_id')
    def _user_id_default(self):
        """
        Default user_id to current running user.
        """
        return os.geteuid()

    user_name = Unicode(
        'jovyan',
        help="""
        Username of the user to create inside the built image.
        Should be a username that is not currently used by anything in the image,
        and should conform to the restrictions on user names for Linux.
        Defaults to username of currently running user, since that is the most
        common case when running r2d manually.
        """,
        config=True
    )

    @default('user_name')
    def _user_name_default(self):
        """
        Default user_name to current running user.
        """
        # NOTE(review): static default above is 'jovyan' but this dynamic
        # default overrides it with the invoking user's name.
        return pwd.getpwuid(os.getuid()).pw_name
def fetch(self, url, ref, checkout_path):
try:
for line in execute_cmd(['git', 'clone', url, checkout_path],
capture=self.json_logs):
self.log.info(line, extra=dict(phase='fetching'))
except subprocess.CalledProcessError:
self.log.error('Failed to clone repository!',
extra=dict(phase='failed'))
sys.exit(1)
if ref:
try:
for line in execute_cmd(['git', 'reset', '--hard', ref],
cwd=checkout_path,
capture=self.json_logs):
self.log.info(line, extra=dict(phase='fetching'))
except subprocess.CalledProcessError:
self.log.error('Failed to check out ref %s', ref,
extra=dict(phase='failed'))
sys.exit(1)
def validate_image_name(self, image_name):
"""
Validate image_name read by argparse contains only lowercase characters
Args:
image_name (string): argument read by the argument parser
Returns:
unmodified image_name
Raises:
ArgumentTypeError: if image_name contains characters that are not lowercase
"""
if not is_valid_docker_image_name(image_name):
msg = "%r is not a valid docker image name. Image name can contain only lowercase characters." % image_name
raise argparse.ArgumentTypeError(msg)
return image_name
def get_argparser(self):
argparser = argparse.ArgumentParser()
argparser.add_argument(
'--config',
default='repo2docker_config.py',
help="Path to config file for repo2docker"
)
argparser.add_argument(
'--json-logs',
default=False,
action='store_true',
help='Emit JSON logs instead of human readable logs'
)
argparser.add_argument(
'repo',
help=('Path to repository that should be built. Could be '
'local path or a git URL.')
)
argparser.add_argument(
'--image-name',
help=('Name of image to be built. If unspecified will be '
'autogenerated'),
type=self.validate_image_name
)
argparser.add_argument(
'--ref',
help='If building a git url, which ref to check out'
)
argparser.add_argument(
'--debug',
help="Turn on debug logging",
action='store_true',
)
argparser.add_argument(
'--no-build',
dest='build',
action='store_false',
help=('Do not actually build the image. Useful in conjunction '
'with --debug.')
)
argparser.add_argument(
'--build-memory-limit',
help='Total Memory that can be used by the docker build process'
)
argparser.add_argument(
'cmd',
nargs=argparse.REMAINDER,
help='Custom command to run after building container'
)
argparser.add_argument(
'--no-run',
dest='run',
action='store_false',
help='Do not run container after it has been built'
)
argparser.add_argument(
'--publish', '-p',
dest='ports',
action='append',
help='Specify port mappings for the image. Needs a command to run in the container.'
)
argparser.add_argument(
'--publish-all', '-P',
dest='all_ports',
action='store_true',
help='Publish all exposed ports to random host ports.'
)
argparser.add_argument(
'--no-clean',
dest='clean',
action='store_false',
help="Don't clean up remote checkouts after we are done"
)
argparser.add_argument(
'--push',
dest='push',
action='store_true',
help='Push docker image to repository'
)
argparser.add_argument(
'--volume', '-v',
dest='volumes',
action='append',
help='Volumes to mount inside the container, in form src:dest',
default=[]
)
argparser.add_argument(
'--user-id',
help='User ID of the primary user in the image',
type=int
)
argparser.add_argument(
'--user-name',
help='Username of the primary user in the image',
)
argparser.add_argument(
'--env', '-e',
dest='environment',
action='append',
help='Environment variables to define at container run time',
default=[]
)
return argparser
def json_excepthook(self, etype, evalue, traceback):
"""Called on an uncaught exception when using json logging
Avoids non-JSON output on errors when using --json-logs
"""
self.log.error("Error during build: %s", evalue,
exc_info=(etype, evalue, traceback),
extra=dict(phase='failed'))
    def initialize(self):
        """Parse command-line arguments (and the config file) into app state.

        Sets up logging, decides local-vs-remote repo handling, derives the
        output image name, and validates flag combinations (volumes/ports/env
        require a running container), exiting with status 1 on conflicts.
        """
        args = self.get_argparser().parse_args()

        if args.debug:
            self.log_level = logging.DEBUG

        self.load_config_file(args.config)

        if os.path.exists(args.repo):
            # Let's treat this as a local directory we are building
            self.repo_type = 'local'
            self.repo = args.repo
            self.ref = None
            # never delete a user's local directory
            self.cleanup_checkout = False
        else:
            self.repo_type = 'remote'
            self.repo = args.repo
            self.ref = args.ref
            self.cleanup_checkout = args.clean

        if args.json_logs:
            # register JSON excepthook to avoid non-JSON output on errors
            sys.excepthook = self.json_excepthook
            # Need to reset existing handlers, or we repeat messages
            logHandler = logging.StreamHandler()
            formatter = jsonlogger.JsonFormatter()
            logHandler.setFormatter(formatter)
            self.log.handlers = []
            self.log.addHandler(logHandler)
            self.log.setLevel(logging.INFO)
        else:
            # due to json logger stuff above,
            # our log messages include carriage returns, newlines, etc.
            # remove the additional newline from the stream handler
            self.log.handlers[0].terminator = ''
            # We don't want a [Repo2Docker] on all messages
            self.log.handlers[0].formatter = logging.Formatter(
                fmt='%(message)s'
            )

        if args.image_name:
            self.output_image_spec = args.image_name
        else:
            # Attempt to set a sane default!
            # HACK: Provide something more descriptive?
            self.output_image_spec = (
                'r2d' +
                escapism.escape(self.repo, escape_char='-').lower() +
                str(int(time.time()))
            )

        self.push = args.push
        self.run = args.run
        self.json_logs = args.json_logs

        self.build = args.build
        if not self.build:
            # Can't push nor run if we aren't building
            self.run = False
            self.push = False

        # check against self.run and not args.run as self.run is false on --no-build
        if args.volumes and not self.run:
            # Can't mount if we aren't running
            print("To Mount volumes with -v, you also need to run the container")
            sys.exit(1)

        for v in args.volumes:
            src, dest = v.split(':')
            self.volumes[src] = dest

        self.run_cmd = args.cmd

        if args.all_ports and not self.run:
            print('To publish user defined port mappings, the container must also be run')
            sys.exit(1)

        if args.ports and not self.run:
            print('To publish user defined port mappings, the container must also be run')
            sys.exit(1)

        if args.ports and not self.run_cmd:
            print('To publish user defined port mapping, user must specify the command to run in the container')
            sys.exit(1)

        self.ports = validate_and_generate_port_mapping(args.ports)
        self.all_ports = args.all_ports

        # NOTE(review): truthiness checks below mean an explicit
        # `--user-id 0` (root) or a memory limit of 0 is silently ignored
        # and the trait default is kept — confirm whether that is intended.
        if args.user_id:
            self.user_id = args.user_id
        if args.user_name:
            self.user_name = args.user_name

        if args.build_memory_limit:
            self.build_memory_limit = args.build_memory_limit

        if args.environment and not self.run:
            print("To specify environment variables, you also need to run the container")
            sys.exit(1)

        self.environment = args.environment
def push_image(self):
client = docker.APIClient(version='auto', **kwargs_from_env())
# Build a progress setup for each layer, and only emit per-layer
# info every 1.5s
layers = {}
last_emit_time = time.time()
for line in client.push(self.output_image_spec, stream=True):
progress = json.loads(line.decode('utf-8'))
if 'error' in progress:
self.log.error(progress['error'], extra=dict(phase='failed'))
sys.exit(1)
if 'id' not in progress:
continue
if 'progressDetail' in progress and progress['progressDetail']:
layers[progress['id']] = progress['progressDetail']
else:
layers[progress['id']] = progress['status']
if time.time() - last_emit_time > 1.5:
self.log.info('Pushing image\n',
extra=dict(progress=layers, phase='pushing'))
last_emit_time = time.time()
    def run_image(self):
        """Run a container from the built image and wait for it to exit.

        If no custom command was given, launches a Jupyter notebook server on
        a free local port. Streams container logs to our logger, kills the
        container when log streaming ends (e.g. Ctrl-C), and exits this
        process with the container's exit code.
        """
        client = docker.from_env(version='auto')
        if not self.run_cmd:
            # Default behavior: serve a notebook, mapping a free host port
            # to the same port inside the container.
            port = str(self._get_free_port())
            run_cmd = ['jupyter', 'notebook', '--ip', '0.0.0.0',
                       '--port', port]
            ports = {'%s/tcp' % port: port}
        else:
            # run_cmd given by user, if port is also given then pass it on
            run_cmd = self.run_cmd
            if self.ports:
                ports = self.ports
            else:
                ports = {}
        container_volumes = {}
        if self.volumes:
            # Need the low-level API to inspect the image's working dir so
            # relative volume destinations can be resolved against it.
            api_client = docker.APIClient(
                version='auto',
                **docker.utils.kwargs_from_env()
            )
            image = api_client.inspect_image(self.output_image_spec)
            image_workdir = image['ContainerConfig']['WorkingDir']

            for k, v in self.volumes.items():
                container_volumes[os.path.abspath(k)] = {
                    'bind': v if v.startswith('/') else os.path.join(image_workdir, v),
                    'mode': 'rw'
                }

        container = client.containers.run(
            self.output_image_spec,
            publish_all_ports=self.all_ports,
            ports=ports,
            detach=True,
            command=run_cmd,
            volumes=container_volumes,
            environment=self.environment
        )
        while container.status == 'created':
            # Poll until docker actually starts (or fails) the container.
            time.sleep(0.5)
            container.reload()

        try:
            for line in container.logs(stream=True):
                self.log.info(line.decode('utf-8'),
                              extra=dict(phase='running'))
        finally:
            # Always clean up, even if log streaming is interrupted.
            container.reload()
            if container.status == 'running':
                self.log.info('Stopping container...\n',
                              extra=dict(phase='running'))
                container.kill()
            exit_code = container.attrs['State']['ExitCode']
            container.remove()
            # Propagate the container's exit code as our own.
            sys.exit(exit_code)
def _get_free_port(self):
"""
Hacky method to get a free random port on local host
"""
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("", 0))
port = s.getsockname()[1]
s.close()
return port
def start(self):
if self.repo_type == 'local':
checkout_path = self.repo
else:
if self.git_workdir is None:
checkout_path = tempfile.mkdtemp(prefix='repo2docker')
else:
checkout_path = self.git_workdir
# keep as much as possible in the context manager to make sure we
# cleanup if things go wrong
with maybe_cleanup(checkout_path, self.cleanup_checkout):
if self.repo_type == 'remote':
self.fetch(
self.repo,
self.ref,
checkout_path
)
os.chdir(checkout_path)
picked_buildpack = compose(self.default_buildpack, parent=self)
for bp_spec in self.buildpacks:
bp = compose(bp_spec, parent=self)
if bp.detect():
picked_buildpack = bp
break
self.log.debug(picked_buildpack.render(),
extra=dict(phase='building'))
if self.build:
build_args = {
'NB_USER': self.user_name,
'NB_UID': str(self.user_id)
}
self.log.info('Using %s builder\n', bp.name,
extra=dict(phase='building'))
for l in picked_buildpack.build(self.output_image_spec, self.build_memory_limit, build_args):
if 'stream' in l:
self.log.info(l['stream'],
extra=dict(phase='building'))
elif 'error' in l:
self.log.info(l['error'], extra=dict(phase='failure'))
sys.exit(1)
elif 'status' in l:
self.log.info('Fetching base image...\r',
extra=dict(phase='building'))
else:
self.log.info(json.dumps(l),
extra=dict(phase='building'))
if self.push:
self.push_image()
if self.run:
self.run_image()