# repo2docker/repo2docker/utils.py

import os
import platform
import re
import subprocess
import warnings
from contextlib import contextmanager
from enum import Enum
from functools import partial
from shutil import copy2, copystat

import chardet
from traitlets import Integer, TraitError


class R2dState(Enum):
    """Enumeration of the states repo2docker can report itself to be in.

    Each member's value is the lowercase string form of its name, and
    `str(member)` yields that value.
    """

    FETCHING = "fetching"
    BUILDING = "building"
    PUSHING = "pushing"
    RUNNING = "running"
    FAILED = "failed"

    def __str__(self):
        # Render as the bare value ("building") rather than the default
        # "R2dState.BUILDING" representation.
        return self.value


def execute_cmd(cmd, capture=False, **kwargs):
    """
    Call given command, yielding output line by line if capture=True.

    This function is a generator: nothing is executed until it is iterated,
    so it must be yielded from (or otherwise consumed) even when
    `capture=False`.

    Args:
        cmd: the command, passed unchanged to `subprocess.Popen`.
        capture (bool): when true, stdout and stderr of the command are
            merged and yielded as text, one line at a time. Lines are split
            on `\\r` as well as `\\n`. When false, the command inherits the
            caller's streams and nothing is yielded.
        **kwargs: extra keyword arguments for `subprocess.Popen`.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    if capture:
        kwargs["stdout"] = subprocess.PIPE
        kwargs["stderr"] = subprocess.STDOUT

    proc = subprocess.Popen(cmd, **kwargs)

    if not capture:
        # not capturing output, let subprocesses talk directly to terminal
        ret = proc.wait()
        if ret != 0:
            raise subprocess.CalledProcessError(ret, cmd)
        return

    # Capture output for logging.
    # Each line will be yielded as text.
    # This should behave the same as .readline(), but splits on `\r` OR `\n`,
    # not just `\n`.
    buf = []

    def flush():
        """Flush next line of the buffer"""
        line = b"".join(buf).decode("utf8", "replace")
        buf[:] = []
        return line

    # Previous byte read; bytes (not str) so the comparisons below are
    # between values of the same type.
    c_last = b""
    try:
        for c in iter(partial(proc.stdout.read, 1), b""):
            # A lone `\r` (not followed by `\n`) terminates a line, e.g.
            # progress output that rewrites the current terminal line.
            if c_last == b"\r" and buf and c != b"\n":
                yield flush()
            buf.append(c)
            if c == b"\n":
                yield flush()
            c_last = c
        if buf:
            yield flush()
    finally:
        # Close our end of the pipe *before* waiting. If the consumer stopped
        # iterating early, a child blocked on a full pipe would otherwise
        # never exit and wait() would hang forever. This also releases the
        # file descriptor in the normal case.
        proc.stdout.close()
        ret = proc.wait()
        if ret != 0:
            raise subprocess.CalledProcessError(ret, cmd)


@contextmanager
def chdir(path):
    """Run the body of a `with` block with `path` as the working directory.

    The working directory that was current on entry is remembered and
    switched back to on exit, whether the body finishes normally or raises.
    Because the restore target is the earlier directory, this keeps working
    even if `path` itself is removed while the block runs.
    """
    previous = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous)


@contextmanager
def open_guess_encoding(path):
    """
    Open a file in text mode, specifying its encoding,
    that we guess using chardet.

    The file is first read in binary mode and fed to chardet line by line
    until the detector reports it is done (or the file ends). It is then
    reopened in text mode with the detected encoding and yielded; the file
    is closed when the `with` block exits.

    NOTE(review): if chardet cannot determine an encoding it presumably
    reports `None`, in which case `open()` uses its platform default.
    """
    detector = chardet.universaldetector.UniversalDetector()
    with open(path, "rb") as f:
        # Iterate lazily instead of `readlines()` so a large file is not
        # loaded into memory when the detector finishes early.
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()

    with open(path, encoding=detector.result["encoding"]) as file:
        yield file


def validate_and_generate_port_mapping(port_mappings):
    """
    Validate a list of port mappings and return a dictionary of port mappings.

    Args:
        port_mappings (list): List of strings of format
            `'host_port:container_port'` with optional tcp udp values and host
            network interface, e.g. `'8888'`, `'8888:8888/udp'` or
            `'127.0.0.1:8888:8888'`. A single port `'8888'` is treated as
            `'8888:8888'`. `None` is treated as an empty list.

    Returns:
        Dictionary of port mappings in the format accepted by docker-py's
        `containers.run()` method (https://docker-py.readthedocs.io/en/stable/containers.html).
        Keys are `'<container_port>/<protocol>'` (protocol defaults to
        `tcp`); values are either the host port string or an
        `(interface, host_port)` tuple.

    Raises:
        ValueError on invalid port mapping

    Note:
        One limitation of repo2docker is it cannot bind a
        single container_port to multiple host_ports
        (docker-py supports this but repo2docker does not)
    """

    def check_port(port, mapping):
        """Return `port` unchanged if it is an integer in 1-65535."""
        try:
            p = int(port)
        except ValueError as e:
            raise ValueError(
                f'Port specification "{mapping}" has an invalid port.'
            ) from e
        if not 0 < p <= 65535:
            raise ValueError(
                f'Port specification "{mapping}" specifies a port outside 1-65535.'
            )
        return port

    def check_port_string(p, mapping):
        """Validate `'<port>[/<protocol>]'` and return `'<port>/<protocol>'`."""
        port, sep, protocol = p.partition("/")
        if not sep:
            # no protocol given, e.g. '134'
            protocol = "tcp"
        elif protocol not in ("tcp", "udp"):
            # Also rejects extra '/' components such as '134/tcp/udp', which
            # would otherwise fall through with no port or protocol set.
            raise ValueError(f'Port specification "{mapping}" has an invalid protocol.')

        check_port(port, mapping)

        return "/".join((port, protocol))

    ports = {}
    if port_mappings is None:
        return ports

    for mapping in port_mappings:
        if ":" in mapping:
            parts = mapping.split(":")
        else:
            # single port '8888' specified,
            # treat as '8888:8888'
            parts = [mapping, mapping]

        *host, container_port = parts
        if len(host) == 1:
            # just a port
            host = check_port(host[0], mapping)
        elif len(host) == 2:
            # '<interface>:<host_port>'
            host = (host[0], check_port(host[1], mapping))
        else:
            # Reject rather than silently ignore the extra components.
            raise ValueError(
                f'Port specification "{mapping}" has too many ":"-separated parts.'
            )

        container_port = check_port_string(container_port, mapping)
        ports[container_port] = host

    return ports


# Compiled once at import time. The structure follows the reference grammar in
# https://github.com/docker/distribution/blob/master/reference/regexp.go
#
# The name separator uses `[-]+` rather than the reference's `[-]*`: an
# empty separator only lets two adjacent alphanumeric runs match, which the
# surrounding `[a-z0-9]+` already covers, so the accepted names are the
# same. With `[-]*`, Python's backtracking engine can take exponential time
# on long invalid names.
_DOCKER_IMAGE_NAME_RE = re.compile(
    r"""
    ^  # Anchored at start and end of string

    (  # Start capturing name

    (?:  # start grouping the optional registry domain name part

    (?:[a-z0-9]|[a-z0-9][a-z0-9-]*[a-z0-9])  # lowercase only '<domain-name-component>'

    (?:  # start optional group

    # multiple repetitions of pattern '.<domain-name-component>'
    (?:\.(?:[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9]))+

    )?  # end optional grouping part of the '.' separated domain name

    (?::[0-9]+)?/  # '<domain-name>' followed by an optional '<port>' component followed by '/' literal

    )?  # end grouping the optional registry domain part

    # start <name-pattern>
    [a-z0-9]+   # must have a <name-component>
    (?:
    (?:(?:[\._]|__|[-]+)[a-z0-9]+)+  # repeat the pattern '<separator><name-component>'
    )?  # optionally have multiple repetitions of the above line
    # end <name-pattern>

    (?:  # start optional name components

    (?:  # start multiple repetitions

    /   # separate multiple name components by /
    # start <name-pattern>
    [a-z0-9]+                        # must have a <name-component>
    (?:
    (?:(?:[\._]|__|[-]+)[a-z0-9]+)+  # repeat the pattern '<separator><name-component>'
    )?                               # optionally have multiple repetitions of the above line
    # end <name-pattern>

    )+  # multiple repetitions of the pattern '/<name-component><separator><name-component>'

    )?  # optionally have the above group

    )   # end capturing name

    (?::([\w][\w.-]{0,127}))?    # optional capture <tag-pattern>=':<tag>'
    # optionally capture <digest-pattern>='@<digest>'
    (?:@[A-Za-z][A-Za-z0-9]*(?:[-_+.][A-Za-z][A-Za-z0-9]*)*[:][A-Fa-f0-9]{32,})?
    \Z  # end of string; unlike '$' this does not accept a trailing newline
    """,
    re.VERBOSE,
)


def is_valid_docker_image_name(image_name):
    """
    Determine if image name is valid for docker using strict pattern.

    Tests `image_name` against a regex modelled on the docker reference
    grammar defined in
    https://github.com/docker/distribution/blob/master/reference/regexp.go
    The pattern accepts an optional registry domain (with optional port),
    one or more `/`-separated name components, an optional `:<tag>` and an
    optional `@<digest>`.

    Args:
        image_name: string representing a docker image name

    Returns:
        True if image_name is valid, else False

    Example:
        'test.Com/name:latest' is a valid tag

        'Test/name:latest' is not a valid tag

    Note:
        This function has a stricter pattern than
        https://github.com/docker/distribution/blob/master/reference/regexp.go

        This pattern will not allow cases like `TEST.com/name:latest` though
        docker considers it a valid tag.
    """
    return _DOCKER_IMAGE_NAME_RE.match(image_name) is not None


class ByteSpecification(Integer):
    """
    Allow easily specifying bytes in units of 1024 with suffixes

    Suffixes allowed are:
      - K -> Kilobyte
      - M -> Megabyte
      - G -> Gigabyte
      - T -> Terabyte

    Adapted from JupyterHub
    """

    # Multiplier, in bytes, for each accepted single-letter suffix.
    UNIT_SUFFIXES = {
        "K": 1024,
        "M": 1024 * 1024,
        "G": 1024 * 1024 * 1024,
        "T": 1024 * 1024 * 1024 * 1024,
    }

    # Default to allowing None as a value
    allow_none = True

    def validate(self, obj, value):
        """
        Validate that the passed-in value is a valid memory specification

        If value is an int or float, it is taken as a byte value and
        truncated to an int.
        If value is a string ending in one of the unit suffixes, the part
        before the suffix is parsed as a number and converted into the
        appropriate pure byte value.

        Raises:
            TraitError: if value is neither a number nor a string made of a
                finite number followed by one of the suffixes K, M, G, T.
        """
        if isinstance(value, (int, float)):
            return int(value)

        invalid = (
            f"{value} is not a valid memory specification. "
            "Must be an int or a string with suffix K, M, G, T"
        )

        # Anything that is not a string (e.g. a list) cannot carry a suffix;
        # report it as a trait error rather than leaking a TypeError.
        if not isinstance(value, str):
            raise TraitError(invalid)

        try:
            num = float(value[:-1])
        except ValueError as e:
            raise TraitError(invalid) from e

        suffix = value[-1]
        if suffix not in self.UNIT_SUFFIXES:
            raise TraitError(invalid)

        try:
            return int(num * self.UNIT_SUFFIXES[suffix])
        except (ValueError, OverflowError) as e:
            # float() accepts "nan" and "inf", which int() cannot convert.
            raise TraitError(invalid) from e


def check_ref(ref, cwd=None):
    """Prepare a ref and ensure it works with git reset --hard.

    Candidates are tried in order with `git rev-parse` and the first one
    that resolves wins:

    1. `ref` exactly as given,
    2. `origin/<ref>`, in case it is a remote branch,
    3. the part of `ref` after the last `/`, in case it is a (partial)
       commit hash prefixed with a remote name.

    Args:
        ref (str): branch, tag or commit to resolve.
        cwd: directory to run git in (passed to `subprocess`); `None` means
            the current working directory.

    Returns:
        The resolved hash as a string, or `None` if no candidate resolves
        (including when git cannot be run at all).
    """
    candidates = [
        ref,  # Original ref
        "/".join(["origin", ref]),  # In case its a remote branch
        ref.split("/")[-1],  # In case partial commit w/ remote
    ]

    for candidate in candidates:
        call = ["git", "rev-parse", "--quiet", candidate]
        try:
            # If success, output will be <hash>
            response = subprocess.check_output(call, stderr=subprocess.DEVNULL, cwd=cwd)
            # Stop at the first match so a later, looser candidate (e.g. a
            # tag `b` when asked for branch `a/b`) cannot override it.
            return response.decode().strip()
        except (subprocess.SubprocessError, OSError, ValueError):
            # Candidate did not resolve (or git could not be run); try the
            # next one. The caller handles the case where none resolve.
            continue
    return None


class Error(OSError):
    """Raised by `copytree` when one or more entries could not be copied.

    `copytree` passes the collected list of failures as the single
    argument, so it is available as `err.args[0]`.
    """


# a copy of shutil.copytree() that is ok with the target directory
# already existing
def copytree(
    src,
    dst,
    symlinks=False,
    ignore=None,
    copy_function=copy2,
    ignore_dangling_symlinks=False,
):
    """Recursively copy a directory tree.
    Unlike shutil.copytree(), the destination directory may already exist;
    it (and any missing parents) is created if needed and existing contents
    are kept unless overwritten by a copied file.
    If exception(s) occur, an Error is raised with a list of reasons.
    If the optional symlinks flag is true, symbolic links in the
    source tree result in symbolic links in the destination tree; if
    it is false, the contents of the files pointed to by symbolic
    links are copied. If the file pointed by the symlink doesn't
    exist, an exception will be added in the list of errors raised in
    an Error exception at the end of the copy process.
    You can set the optional ignore_dangling_symlinks flag to true if you
    want to silence this exception. Notice that this has no effect on
    platforms that don't support os.symlink.
    The optional ignore argument is a callable. If given, it
    is called with the `src` parameter, which is the directory
    being visited by copytree(), and `names` which is the list of
    `src` contents, as returned by os.listdir():
        callable(src, names) -> ignored_names
    Since copytree() is called recursively, the callable will be
    called once for each directory that is copied. It returns a
    list of names relative to the `src` directory that should
    not be copied.
    The optional copy_function argument is a callable that will be used
    to copy each file. It will be called with the source path and the
    destination path as arguments. By default, copy2() is used, but any
    function that supports the same signature (like copy()) can be used.
    Returns `dst`.
    """
    names = os.listdir(src)
    if ignore is not None:
        ignored_names = ignore(src, names)
    else:
        ignored_names = set()

    os.makedirs(dst, exist_ok=True)
    errors = []
    for name in names:
        if name in ignored_names:
            continue
        srcname = os.path.join(src, name)
        dstname = os.path.join(dst, name)
        try:
            if os.path.islink(srcname):
                linkto = os.readlink(srcname)
                if symlinks:
                    # We can't just leave it to `copy_function` because legacy
                    # code with a custom `copy_function` may rely on copytree
                    # doing the right thing.
                    os.symlink(linkto, dstname)
                    copystat(srcname, dstname, follow_symlinks=not symlinks)
                else:
                    # ignore dangling symlink if the flag is on.
                    # Test the link itself (exists() follows it): checking
                    # the raw link text would resolve a relative target
                    # against the current working directory instead of the
                    # directory containing the link.
                    if not os.path.exists(srcname) and ignore_dangling_symlinks:
                        continue
                    # otherwise let the copy occurs. copy2 will raise an error
                    if os.path.isdir(srcname):
                        copytree(
                            srcname,
                            dstname,
                            symlinks,
                            ignore,
                            copy_function,
                            ignore_dangling_symlinks,
                        )
                    else:
                        copy_function(srcname, dstname)
            elif os.path.isdir(srcname):
                # Pass every option down so subdirectories are treated the
                # same way as the top level.
                copytree(
                    srcname,
                    dstname,
                    symlinks,
                    ignore,
                    copy_function,
                    ignore_dangling_symlinks,
                )
            else:
                # Will raise a SpecialFileError for unsupported file types
                copy_function(srcname, dstname)
        # catch the Error from the recursive copytree so that we can
        # continue with other files
        except Error as err:
            errors.extend(err.args[0])
        except OSError as why:
            errors.append((srcname, dstname, str(why)))
    try:
        copystat(src, dst)
    except OSError as why:
        # Copying file access times may fail on Windows
        if getattr(why, "winerror", None) is None:
            errors.append((src, dst, str(why)))
    if errors:
        raise Error(errors)
    return dst


def deep_get(dikt, path):
    """Look up a value inside nested containers using a dotted path.

    `path` is split on periods and each piece is used to index one level
    deeper. Pieces consisting only of digits are converted to `int` (so
    they can index lists); all other pieces are used as string keys:

    deep_get(data, "data.files.0") == data["data"]["files"][0]
    """
    current = dikt
    for piece in path.split("."):
        key = int(piece) if piece.isdigit() else piece
        current = current[key]
    return current


# doi_regexp, is_doi, and normalize_doi are from idutils (https://github.com/inveniosoftware/idutils)
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2018 Alan Rubin.
# Licensed under BSD-3-Clause license
# Pattern for a DOI with an optional prefix, matched case-insensitively:
#   group 1 - optional "doi:" prefix or "doi.org/" URL prefix (with optional
#             scheme and optional "dx." host part)
#   group 2 - the DOI itself, starting with "10." and containing a "/"
# NOTE(review): the "." in `(.\d+)*` is unescaped, so it matches any
# character rather than only a literal dot. Left as-is since the pattern is
# taken from idutils; confirm before tightening.
doi_regexp = re.compile(
    r"(doi:\s*|(?:https?://)?(?:dx\.)?doi\.org/)?(10\.\d+(.\d+)*/.+)$", flags=re.I
)


def is_doi(val):
    """Check whether `val` matches the pattern of a DOI.

    Returns the match object produced by `doi_regexp` when `val` matches,
    and None when it does not, so the result can be used as a truth value.
    See http://en.wikipedia.org/wiki/Digital_object_identifier.
    """
    match = doi_regexp.match(val)
    return match


def normalize_doi(val):
    """Strip any prefix from `val` and return the bare DOI.

    For example both `https://doi.org/10.1234/jshd123` and
    `doi:10.1234/jshd123` give `10.1234/jshd123`.

    `val` must match `doi_regexp`; otherwise the lookup on the missing
    match raises AttributeError.
    """
    # Group 2 of the pattern is the DOI without the optional prefix.
    return doi_regexp.match(val).group(2)


def is_local_pip_requirement(line):
    """Tell whether one line of a pip requirements file points at a local file.

    Returns True for nested requirement/constraint files (`-r`, `-c` and
    their long spellings), for anything containing `file://`, and for
    requirements whose path part starts with `.`. Returns False for blank
    lines, comments, and everything else.
    """
    # Drop any trailing comment; nothing left means nothing to check.
    requirement = line.split("#", 1)[0].strip()
    if not requirement:
        return False

    # Nested requirement/constraint files (short or long spelling) break
    # isolation.
    if requirement.startswith(("-r", "-c", "--requirement", "--constraint")):
        return True

    # `--pre` is a global flag that sits on a line of its own; it never
    # refers to a local file.
    if requirement.startswith("--pre"):
        return False

    # Long options carry their value after `=` (e.g. `--editable=...`).
    # Without an `=` the line is either malformed or beyond this simple
    # parser, and is handled by the short-option branch below.
    if requirement.startswith("--") and "=" in requirement:
        requirement = requirement.partition("=")[2]

    # Options separated from their value by whitespace (`-e foo`). The
    # `-e=foo` form is not handled.
    if requirement.startswith("-"):
        tokens = requirement.split(None, 1)
        if len(tokens) < 2:
            # flag without an argument, nothing to inspect
            return False
        requirement = tokens[1]

    # Explicit file URLs break isolation.
    if "file://" in requirement:
        return True

    # For other URLs look at what follows the scheme, so that something
    # like git://../local/file is still recognised as local.
    before, scheme_sep, after = requirement.partition("://")
    path = after if scheme_sep else before

    # A leading "." means a relative reference to a local file.
    return path.startswith(".")


def get_platform():
    """Return the target platform of the container image

    The result is derived from `platform.machine()` and is either
    `linux/amd64` or `linux/arm64`. Unrecognised machine names produce a
    warning and fall back to `linux/amd64`.
    """
    known = {
        "x86_64": "linux/amd64",
        # Linux reports aarch64
        "aarch64": "linux/arm64",
        # OSX reports arm64
        "arm64": "linux/arm64",
    }
    m = platform.machine()
    target = known.get(m)
    if target is None:
        warnings.warn(f"Unexpected platform '{m}', defaulting to linux/amd64")
        return "linux/amd64"
    return target