From 4e863a3a9cf1abe90ff925eae773bc1e7cc6e57f Mon Sep 17 00:00:00 2001
From: wlski <105810032+wlski@users.noreply.github.com>
Date: Sat, 22 Jun 2024 19:42:04 -0400
Subject: [PATCH] Combined regex patterns, avoiding multiple steps for minimal processing

---
Reviewer notes (text between the separator above and the diffstat is
ignored by `git am`):

* Line breaks and indentation of this patch were restored; the number of
  context/added/removed lines matches the hunk header (19 old, 15 new).

* WHITESPACE_PATTERN: the submitted pattern r'[ \t]+' replaced tabs with a
  space, but the code being removed deleted tabs (TAB_PATTERN.sub('', s)),
  so "a\tb" changed from "ab" to "a b". The amended pattern r' +|\t' with a
  callable replacement still runs in one pass and reproduces the removed
  two-step result exactly (space runs collapse to one space, every tab is
  dropped, and space runs on either side of a tab are not merged).

* LEADING_TRAILING_NEWLINE_PATTERN: the submitted r'(\n )|( \n)' is applied
  in a single non-overlapping pass, so for "a \n b" the match " \n" consumes
  the newline and the space after it is left behind ("a\n b"); the removed
  two sequential passes produced "a\nb". The amended r' ?\n ?' strips one
  space on each side of every newline in one pass, which is what the two
  removed passes did together.

* The post-image blob id on the "index" line comes from the original
  submission and no longer matches the amended content; regenerate the
  patch after applying if the id matters.

 zippy/zippy.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/zippy/zippy.py b/zippy/zippy.py
index d2f2cbe..f916086 100755
--- a/zippy/zippy.py
+++ b/zippy/zippy.py
@@ -25,19 +25,15 @@ class CompressionEngine(Enum):
     BROTLI = 3
 
 # Precompile regex patterns for reuse
-SPACE_PATTERN = re.compile(' +')
-TAB_PATTERN = re.compile('\t')
-NEWLINE_PATTERN = re.compile('\n+')
-LEADING_NEWLINE_PATTERN = re.compile('\n ')
-TRAILING_NEWLINE_PATTERN = re.compile(' \n')
+WHITESPACE_PATTERN = re.compile(r' +|\t')
+NEWLINE_PATTERN = re.compile(r'\n+')
+LEADING_TRAILING_NEWLINE_PATTERN = re.compile(r' ?\n ?')
 NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z,\.\(\) \n]')
 
 def clean_text(s: str) -> str:
-    s = SPACE_PATTERN.sub(' ', s)
-    s = TAB_PATTERN.sub('', s)
+    s = WHITESPACE_PATTERN.sub(lambda m: '' if m.group() == '\t' else ' ', s)
     s = NEWLINE_PATTERN.sub('\n', s)
-    s = LEADING_NEWLINE_PATTERN.sub('\n', s)
-    s = TRAILING_NEWLINE_PATTERN.sub('\n', s)
+    s = LEADING_TRAILING_NEWLINE_PATTERN.sub('\n', s)
     s = NON_ALNUM_PATTERN.sub('', s)
     return s
 