Precompile regex patterns for reuse

pull/15/head
wlski 2024-06-22 19:39:32 -04:00
rodzic b8c60a105e
commit 554d922e31
1 zmienionych plików z 15 dodań i 14 usunięć

Wyświetl plik

@@ -24,20 +24,21 @@ class CompressionEngine(Enum):
ZLIB = 2
BROTLI = 3
def clean_text(s : str) -> str:
'''
Normalizes whitespace and removes formatting and other non-content data that may skew
compression ratios (e.g., duplicate spaces).

Each substitution below runs once, in the order written, over the whole string.
'''
# Collapse each run of spaces to one space, delete tabs, and collapse each run of newlines to one newline.
s = re.sub(' +', ' ', s)
s = re.sub('\t', '', s)
s = re.sub('\n+', '\n', s)
# Drop a space that directly follows, or directly precedes, a newline.
s = re.sub('\n ', '\n', s)
s = re.sub(' \n', '\n', s)
# Delete every character except ASCII letters, digits, ',', '.', '(', ')', space and newline.
s = re.sub(r'[^0-9A-Za-z,\.\(\) \n]', '', s)#.lower()
# Regex patterns used by clean_text(), compiled once at import time so that
# every call reuses the same pattern objects.
SPACE_PATTERN = re.compile(' +')  # one or more consecutive spaces
TAB_PATTERN = re.compile('\t')  # a single tab character
NEWLINE_PATTERN = re.compile('\n+')  # one or more consecutive newlines
LEADING_NEWLINE_PATTERN = re.compile('\n ')  # a newline followed by a space
TRAILING_NEWLINE_PATTERN = re.compile(' \n')  # a space followed by a newline
# Anything that is not an ASCII letter, digit, ',', '.', '(', ')', space or newline.
NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z,\.\(\) \n]')

# (pattern, replacement) pairs, listed in the exact order clean_text() applies them.
_CLEANING_PASSES = (
    (SPACE_PATTERN, ' '),
    (TAB_PATTERN, ''),
    (NEWLINE_PATTERN, '\n'),
    (LEADING_NEWLINE_PATTERN, '\n'),
    (TRAILING_NEWLINE_PATTERN, '\n'),
    (NON_ALNUM_PATTERN, ''),
)


def clean_text(s: str) -> str:
    '''
    Removes formatting and other non-content data that may skew compression ratios.

    The passes, in order: collapse runs of spaces to one space, delete tabs,
    collapse runs of newlines to one newline, drop a space directly after a
    newline, drop a space directly before a newline, and finally delete every
    character that is not an ASCII letter, digit, comma, period, parenthesis,
    space or newline.

    Each pass runs exactly once, so a later pass can leave behind something an
    earlier pass would have collapsed (e.g. deleting a tab that sits between
    two spaces leaves a double space).

    Returns the cleaned string; the argument itself is not modified.
    '''
    cleaned = s
    for pattern, replacement in _CLEANING_PASSES:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
# The prelude file is a text file containing only AI-generated text; it is used to 'seed' the LZMA dictionary.
@@ -393,4 +394,4 @@ def main():
print(str(z.run_on_file_chunked(f)))
# Invoke main() exactly once, and only when this file is executed directly
# rather than imported as a module.
if __name__ == '__main__':
    main()