Combined regex patterns, avoiding multiple steps for minimal processing

pull/15/head
wlski 2024-06-22 19:42:04 -04:00
rodzic 554d922e31
commit 4e863a3a9c
1 zmienionych plików z 5 dodań i 9 usunięć

Wyświetl plik

@@ -25,19 +25,15 @@ class CompressionEngine(Enum):
BROTLI = 3 BROTLI = 3
# Precompiled regex patterns, built once at import time and reused by clean_text().

# A run of spaces and/or tabs.
WHITESPACE_PATTERN = re.compile(r'[ \t]+')
# A run of consecutive newlines.
NEWLINE_PATTERN = re.compile(r'\n+')
# A newline together with the optional single space on either side of it.
# Both sides are consumed by one match; an alternation such as
# r'(\n )|( \n)' would strip only one side of " \n " in a single pass.
LEADING_TRAILING_NEWLINE_PATTERN = re.compile(r' ?\n ?')
# Any character other than ASCII letters, digits, ",", ".", "(", ")",
# space and newline.
NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z,\.\(\) \n]')


def clean_text(s: str) -> str:
    """Normalize *s* to a compact form with a restricted character set.

    Tabs are treated as spaces, every character other than ASCII letters,
    digits, ``, . ( )``, space and newline is dropped, runs of spaces are
    collapsed to one space, spaces adjacent to a newline are removed, and
    runs of newlines are collapsed to one newline.

    Leading and trailing whitespace of the whole string is not stripped.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text.
    """
    # Turn tabs into spaces first: NON_ALNUM_PATTERN would otherwise delete
    # them outright and glue the neighbouring words together.
    s = s.replace('\t', ' ')
    # Drop disallowed characters before normalizing whitespace, so that the
    # gaps they leave behind ("a @ b" -> "a  b") are still collapsed below.
    s = NON_ALNUM_PATTERN.sub('', s)
    s = WHITESPACE_PATTERN.sub(' ', s)
    # Space runs are now single spaces, so " ?\n ?" removes everything that
    # surrounds a newline.
    s = LEADING_TRAILING_NEWLINE_PATTERN.sub('\n', s)
    # Collapse newlines last: stripping space-only lines above can make
    # previously separated newlines adjacent ("a\n \nb" -> "a\n\nb").
    s = NEWLINE_PATTERN.sub('\n', s)
    return s