kopia lustrzana https://github.com/thinkst/zippy
Combined regex patterns to reduce the number of substitution passes in clean_text
rodzic
554d922e31
commit
4e863a3a9c
|
@ -25,19 +25,15 @@ class CompressionEngine(Enum):
|
||||||
BROTLI = 3
|
BROTLI = 3
|
||||||
|
|
||||||
# Precompile regex patterns for reuse
|
# Precompile regex patterns for reuse
|
||||||
SPACE_PATTERN = re.compile(' +')
|
WHITESPACE_PATTERN = re.compile(r'[ \t]+')
|
||||||
TAB_PATTERN = re.compile('\t')
|
NEWLINE_PATTERN = re.compile(r'\n+')
|
||||||
NEWLINE_PATTERN = re.compile('\n+')
|
LEADING_TRAILING_NEWLINE_PATTERN = re.compile(r'(\n )|( \n)')
|
||||||
LEADING_NEWLINE_PATTERN = re.compile('\n ')
|
|
||||||
TRAILING_NEWLINE_PATTERN = re.compile(' \n')
|
|
||||||
NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z,\.\(\) \n]')
|
NON_ALNUM_PATTERN = re.compile(r'[^0-9A-Za-z,\.\(\) \n]')
|
||||||
|
|
||||||
def clean_text(s: str) -> str:
|
def clean_text(s: str) -> str:
|
||||||
s = SPACE_PATTERN.sub(' ', s)
|
s = WHITESPACE_PATTERN.sub(' ', s)
|
||||||
s = TAB_PATTERN.sub('', s)
|
|
||||||
s = NEWLINE_PATTERN.sub('\n', s)
|
s = NEWLINE_PATTERN.sub('\n', s)
|
||||||
s = LEADING_NEWLINE_PATTERN.sub('\n', s)
|
s = LEADING_TRAILING_NEWLINE_PATTERN.sub('\n', s)
|
||||||
s = TRAILING_NEWLINE_PATTERN.sub('\n', s)
|
|
||||||
s = NON_ALNUM_PATTERN.sub('', s)
|
s = NON_ALNUM_PATTERN.sub('', s)
|
||||||
return s
|
return s
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue