From 39c2ae276b3016d8c07c847e09fc1efbd882ec99 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Sun, 4 Nov 2018 13:23:04 +0100 Subject: [PATCH 1/2] Start reusing existing docker images if content hasn't changed Content providers can specify a "content ID" which is used to identify versions of the content. The ID is used in the docker image name and if we find an existing image for a given content ID the build step is skipped. --- repo2docker/app.py | 126 ++++++++++++++++----------- repo2docker/contentproviders/base.py | 18 ++++ repo2docker/contentproviders/git.py | 12 +++ 3 files changed, 103 insertions(+), 53 deletions(-) diff --git a/repo2docker/app.py b/repo2docker/app.py index 17f1b39a..7fab7b4e 100644 --- a/repo2docker/app.py +++ b/repo2docker/app.py @@ -13,7 +13,6 @@ import sys import logging import os import pwd -import subprocess import tempfile import time @@ -215,6 +214,15 @@ class Repo2Docker(Application): spec, checkout_path, yield_output=self.json_logs): self.log.info(log_line, extra=dict(phase='fetching')) + self.output_image_spec = ( + 'r2d' + + escapism.escape(self.repo, escape_char='-').lower() + ) + if picked_content_provider.content_id is not None: + self.output_image_spec += picked_content_provider.content_id + else: + self.output_image_spec += str(int(time.time())) + def validate_image_name(self, image_name): """ Validate image_name read by argparse @@ -471,13 +479,8 @@ class Repo2Docker(Application): if args.image_name: self.output_image_spec = args.image_name else: - # Attempt to set a sane default! - # HACK: Provide something more descriptive? - self.output_image_spec = ( - 'r2d' + - escapism.escape(self.repo, escape_char='-').lower() + - str(int(time.time())) - ) + # we will pick a name after fetching the repository + self.output_image_spec = None self.push = args.push self.run = args.run @@ -669,14 +672,68 @@ class Repo2Docker(Application): s.close() return port + def find_image(self): + # check if we already have an image for this content + client = docker.APIClient(version='auto', **kwargs_from_env()) + for image in client.images(): + if image['RepoTags'] is not None: + for tags in image['RepoTags']: + if tags == self.output_image_spec + ":latest": + return True + + return False + + def _build_image(self, checkout_path): + if self.subdir: + checkout_path = os.path.join(checkout_path, self.subdir) + if not os.path.isdir(checkout_path): + self.log.error('Subdirectory %s does not exist', + self.subdir, extra=dict(phase='failure')) + sys.exit(1) + + with chdir(checkout_path): + for BP in self.buildpacks: + bp = BP() + if bp.detect(): + picked_buildpack = bp + break + else: + picked_buildpack = self.default_buildpack() + + picked_buildpack.appendix = self.appendix + + self.log.debug(picked_buildpack.render(), + extra=dict(phase='building')) + + if self.build: + build_args = { + 'NB_USER': self.user_name, + 'NB_UID': str(self.user_id) + } + self.log.info('Using %s builder\n', bp.__class__.__name__, + extra=dict(phase='building')) + + for l in picked_buildpack.build(self.output_image_spec, + self.build_memory_limit, build_args): + if 'stream' in l: + self.log.info(l['stream'], + extra=dict(phase='building')) + elif 'error' in l: + self.log.info(l['error'], extra=dict(phase='failure')) + sys.exit(1) + elif 'status' in l: + self.log.info('Fetching base image...\r', + extra=dict(phase='building')) + else: + self.log.info(json.dumps(l), + extra=dict(phase='building')) + def start(self): """Start execution of repo2docker""" # Check if r2d can connect to docker daemon if self.build: try: - client = docker.APIClient(version='auto', - **kwargs_from_env()) - del client + docker.APIClient(version='auto', **kwargs_from_env()) except DockerException as e: print("Docker client initialization error. Check if docker is" " running on the host.") @@ -703,49 +760,12 @@ class Repo2Docker(Application): with maybe_cleanup(checkout_path, self.cleanup_checkout): self.fetch(self.repo, self.ref, checkout_path) - if self.subdir: - checkout_path = os.path.join(checkout_path, self.subdir) - if not os.path.isdir(checkout_path): - self.log.error('Subdirectory %s does not exist', - self.subdir, extra=dict(phase='failure')) - sys.exit(1) + if self.find_image(): + self.log.info("Reusing existing image ({}), not " + "building.".format(self.output_image_spec)) - with chdir(checkout_path): - for BP in self.buildpacks: - bp = BP() - if bp.detect(): - picked_buildpack = bp - break - else: - picked_buildpack = self.default_buildpack() - - picked_buildpack.appendix = self.appendix - - self.log.debug(picked_buildpack.render(), - extra=dict(phase='building')) - - if self.build: - build_args = { - 'NB_USER': self.user_name, - 'NB_UID': str(self.user_id) - } - self.log.info('Using %s builder\n', bp.__class__.__name__, - extra=dict(phase='building')) - - for l in picked_buildpack.build(self.output_image_spec, - self.build_memory_limit, build_args): - if 'stream' in l: - self.log.info(l['stream'], - extra=dict(phase='building')) - elif 'error' in l: - self.log.info(l['error'], extra=dict(phase='failure')) - sys.exit(1) - elif 'status' in l: - self.log.info('Fetching base image...\r', - extra=dict(phase='building')) - else: - self.log.info(json.dumps(l), - extra=dict(phase='building')) + else: + self._build_image(checkout_path) if self.push: self.push_image() diff --git a/repo2docker/contentproviders/base.py b/repo2docker/contentproviders/base.py index 9b986325..7b1718a8 100644 --- a/repo2docker/contentproviders/base.py +++ b/repo2docker/contentproviders/base.py @@ -17,6 +17,24 @@ class ContentProvider: def __init__(self): self.log = logging.getLogger("repo2docker") + @property + def content_id(self): + """A unique ID to represent the version of the content. + + This ID is used to name the built images. If the ID is the same between + two runs of repo2docker we will reuse an existing image (if it exists). + + By providing an ID that summarizes the content we can reuse existing + images and speed up build times. A good ID is the revision of a Git + repository or a hash computed from all the content. + + The type content ID can be any string. + + To disable this behaviour set this property to `None` in which case + a fresh image will always be built. + """ + return None + def detect(self, repo, ref=None, extra_args=None): """Determine compatibility between source and this provider. diff --git a/repo2docker/contentproviders/git.py b/repo2docker/contentproviders/git.py index 29652f8c..dddef50e 100644 --- a/repo2docker/contentproviders/git.py +++ b/repo2docker/contentproviders/git.py @@ -44,3 +44,15 @@ class Git(ContentProvider): cwd=output_dir, capture=yield_output): yield line + + cmd = ['git', 'rev-parse', 'HEAD'] + sha1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, cwd=output_dir) + self._sha1 = sha1.stdout.read().decode().strip() + + @property + def content_id(self): + """A unique ID to represent the version of the content. + + Uses the git commit ID of the repository. + """ + return self._sha1 From b9e182791e8946857c217b41a350f8435bfeb6fc Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 29 Nov 2018 18:22:05 +0100 Subject: [PATCH 2/2] Use a subset of the git SHA1 to make image names unique --- repo2docker/contentproviders/git.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/repo2docker/contentproviders/git.py b/repo2docker/contentproviders/git.py index dddef50e..4f67b2cf 100644 --- a/repo2docker/contentproviders/git.py +++ b/repo2docker/contentproviders/git.py @@ -53,6 +53,6 @@ class Git(ContentProvider): def content_id(self): """A unique ID to represent the version of the content. - Uses the git commit ID of the repository. + Uses the first seven characters of the git commit ID of the repository. """ - return self._sha1 + return self._sha1[:7]