From 3752fa18688570f552479501e1311f139ee6ae12 Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Wed, 28 Aug 2019 16:25:12 +0300 Subject: [PATCH 1/5] Added function to handle different file encodings --- repo2docker/buildpacks/conda/__init__.py | 4 ++-- repo2docker/buildpacks/python/__init__.py | 4 ++-- repo2docker/utils.py | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/repo2docker/buildpacks/conda/__init__.py b/repo2docker/buildpacks/conda/__init__.py index c781f00a..4247b945 100644 --- a/repo2docker/buildpacks/conda/__init__.py +++ b/repo2docker/buildpacks/conda/__init__.py @@ -6,7 +6,7 @@ from collections import Mapping from ruamel.yaml import YAML from ..base import BaseImage -from ...utils import is_local_pip_requirement +from ...utils import is_local_pip_requirement, open_utf8convert_read # pattern for parsing conda dependency line PYTHON_REGEX = re.compile(r"python\s*=+\s*([\d\.]*)") @@ -140,7 +140,7 @@ class CondaBuildPack(BaseImage): self._environment_yaml = {} return self._environment_yaml - with open(environment_yml) as f: + with open_utf8convert_read(environment_yml) as f: env = YAML().load(f) # check if the env file is empty, if so instantiate an empty dictionary. if env is None: diff --git a/repo2docker/buildpacks/python/__init__.py b/repo2docker/buildpacks/python/__init__.py index 72029d78..c8d1aaeb 100644 --- a/repo2docker/buildpacks/python/__init__.py +++ b/repo2docker/buildpacks/python/__init__.py @@ -2,7 +2,7 @@ import os from ..conda import CondaBuildPack -from ...utils import is_local_pip_requirement +from ...utils import is_local_pip_requirement, open_utf8convert_read class PythonBuildPack(CondaBuildPack): @@ -86,7 +86,7 @@ class PythonBuildPack(CondaBuildPack): requirements_txt = self.binder_path(name) if not os.path.exists(requirements_txt): continue - with open(requirements_txt) as f: + with open_utf8convert_read(requirements_txt) as f: for line in f: if is_local_pip_requirement(line): return False diff --git a/repo2docker/utils.py b/repo2docker/utils.py index 2b1ae373..b17e1691 100644 --- a/repo2docker/utils.py +++ b/repo2docker/utils.py @@ -3,6 +3,7 @@ from functools import partial import os import re import subprocess +import chardet from shutil import copystat, copy2 @@ -69,6 +70,22 @@ def chdir(path): finally: os.chdir(old_dir) +@contextmanager +def open_utf8convert_read(path): + with open(path, "rb") as f: + file_to_encode = f.read() + + encoding_detection_result = chardet.detect(file_to_encode) + if not "utf-8" in encoding_detection_result: + with open(path, "wb") as f: + f.write(file_to_encode.decode(encoding_detection_result["encoding"]).encode("utf-8")) + + file = open(path) + try: + yield file + finally: + file.close() + def validate_and_generate_port_mapping(port_mappings): """ From 95fd0e5b814cdb049109c13942b458feb6dd31aa Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Thu, 29 Aug 2019 15:13:33 +0300 Subject: [PATCH 2/5] Don't change file encoding --- repo2docker/buildpacks/conda/__init__.py | 4 ++-- repo2docker/buildpacks/python/__init__.py | 4 ++-- repo2docker/utils.py | 22 ++++++++++++++-------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/repo2docker/buildpacks/conda/__init__.py b/repo2docker/buildpacks/conda/__init__.py index 4247b945..c781f00a 100644 --- a/repo2docker/buildpacks/conda/__init__.py +++ b/repo2docker/buildpacks/conda/__init__.py @@ -6,7 +6,7 @@ from collections import Mapping from ruamel.yaml import YAML from ..base import BaseImage -from ...utils import is_local_pip_requirement, open_utf8convert_read +from ...utils import is_local_pip_requirement # pattern for parsing conda dependency line PYTHON_REGEX = re.compile(r"python\s*=+\s*([\d\.]*)") @@ -140,7 +140,7 @@ class CondaBuildPack(BaseImage): self._environment_yaml = {} return self._environment_yaml - with open_utf8convert_read(environment_yml) as f: + with open(environment_yml) as f: env = YAML().load(f) # check if the env file is empty, if so instantiate an empty dictionary. if env is None: diff --git a/repo2docker/buildpacks/python/__init__.py b/repo2docker/buildpacks/python/__init__.py index c8d1aaeb..d53e06fc 100644 --- a/repo2docker/buildpacks/python/__init__.py +++ b/repo2docker/buildpacks/python/__init__.py @@ -2,7 +2,7 @@ import os from ..conda import CondaBuildPack -from ...utils import is_local_pip_requirement, open_utf8convert_read +from ...utils import is_local_pip_requirement, open_guess_encoding class PythonBuildPack(CondaBuildPack): @@ -86,7 +86,7 @@ class PythonBuildPack(CondaBuildPack): requirements_txt = self.binder_path(name) if not os.path.exists(requirements_txt): continue - with open_utf8convert_read(requirements_txt) as f: + with open_guess_encoding(requirements_txt) as f: for line in f: if is_local_pip_requirement(line): return False diff --git a/repo2docker/utils.py b/repo2docker/utils.py index b17e1691..a5f4863b 100644 --- a/repo2docker/utils.py +++ b/repo2docker/utils.py @@ -70,17 +70,23 @@ def chdir(path): finally: os.chdir(old_dir) + @contextmanager -def open_utf8convert_read(path): +def open_guess_encoding(path): + """ + Open a file in text mode, specifying its encoding, + that we guess using chardet. + """ + detector = chardet.universaldetector.UniversalDetector() with open(path, "rb") as f: - file_to_encode = f.read() + for line in f.readlines(): + detector.feed(line) + print(str(i) + str(detector.done)) + if detector.done: + break + detector.close() - encoding_detection_result = chardet.detect(file_to_encode) - if not "utf-8" in encoding_detection_result: - with open(path, "wb") as f: - f.write(file_to_encode.decode(encoding_detection_result["encoding"]).encode("utf-8")) - - file = open(path) + file = open(path, encoding=detector.result["encoding"]) try: yield file finally: From 795769ee2808f51930902223080c1f67e33819c6 Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Thu, 29 Aug 2019 15:51:12 +0300 Subject: [PATCH 3/5] Removed debug print --- repo2docker/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/repo2docker/utils.py b/repo2docker/utils.py index a5f4863b..f8bf96c0 100644 --- a/repo2docker/utils.py +++ b/repo2docker/utils.py @@ -81,7 +81,6 @@ def open_guess_encoding(path): with open(path, "rb") as f: for line in f.readlines(): detector.feed(line) - print(str(i) + str(detector.done)) if detector.done: break detector.close() From 5e4932b704bad721c92d4f832c58ff14295e1867 Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Fri, 30 Aug 2019 11:06:53 +0300 Subject: [PATCH 4/5] Added test for open_guess_encoding --- tests/unit/test_utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 951f2085..b13dbc4a 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -6,6 +6,7 @@ import os from repo2docker import utils import pytest import subprocess +import tempfile def test_capture_cmd_no_capture_success(): @@ -112,6 +113,14 @@ def test_normalize_doi(): assert utils.normalize_doi("http://dx.doi.org/10.1234/jshd123") == "10.1234/jshd123" +def test_open_guess_encoding(): + data = "Rică nu știa să zică râu, rățușcă, rămurică." + with tempfile.NamedTemporaryFile(mode='wb') as test_file: + test_file.write(str.encode(data, "utf-16")) + test_file.seek(0) + with utils.open_guess_encoding(test_file.name) as fd: + assert fd.read() == data + @pytest.mark.parametrize( "req, is_local", [ From 75f4a70fecbf5bed15bbaaee09c4d33e6e034463 Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Fri, 30 Aug 2019 11:26:48 +0300 Subject: [PATCH 5/5] Black refactor --- tests/unit/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index b13dbc4a..6b467315 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -115,12 +115,13 @@ def test_normalize_doi(): def test_open_guess_encoding(): data = "Rică nu știa să zică râu, rățușcă, rămurică." - with tempfile.NamedTemporaryFile(mode='wb') as test_file: + with tempfile.NamedTemporaryFile(mode="wb") as test_file: test_file.write(str.encode(data, "utf-16")) test_file.seek(0) with utils.open_guess_encoding(test_file.name) as fd: assert fd.read() == data + @pytest.mark.parametrize( "req, is_local", [