Precompile regex patterns for reuse

2024-06-22 19:39:32 -04:00 · 2024-06-22 19:39:32 -04:00 · 554d922e31
commit 554d922e31
--- a/zippy/zippy.py
+++ b/zippy/zippy.py
@@ -24,20 +24,21 @@ class CompressionEngine(Enum):
    ZLIB = 2
    BROTLI = 3

-def clean_text(s : str) -> str:
-    '''
-    Removes formatting and other non-content data that may skew compression ratios (e.g., duplicate spaces)
-    '''
-    # Remove extra spaces and duplicate newlines.
-    s = re.sub(' +', ' ', s)
-    s = re.sub('\t', '', s)
-    s = re.sub('\n+', '\n', s)
-    s = re.sub('\n ', '\n', s)
-    s = re.sub(' \n', '\n', s)
-
-    # Remove non-alphanumeric chars
-    s = re.sub(r'[^0-9A-Za-z,\.\(\) \n]', '', s)#.lower()
+# Precompile regex patterns for reuse
+SPACE_PATTERN = re.compile(' +')
+TAB_PATTERN = re.compile('\t')
+NEWLINE_PATTERN = re.compile('\n+')
+LEADING_NEWLINE_PATTERN = re.compile('\n ')
+TRAILING_NEWLINE_PATTERN = re.compile(' \n')
+NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z,\.\(\) \n]')

+def clean_text(s: str) -> str:
+    s = SPACE_PATTERN.sub(' ', s)
+    s = TAB_PATTERN.sub('', s)
+    s = NEWLINE_PATTERN.sub('\n', s)
+    s = LEADING_NEWLINE_PATTERN.sub('\n', s)
+    s = TRAILING_NEWLINE_PATTERN.sub('\n', s)
+    s = NON_ALNUM_PATTERN.sub('', s)
    return s

 # The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
@@ -393,4 +394,4 @@ def main():
                print(str(z.run_on_file_chunked(f)))

 if __name__ == '__main__':
-    main()
+    main()