Improve performance by farming out chunks to other processes with multiprocessing

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
Jacob Torrey 2023-05-12 08:20:29 -06:00
parent ee631945b6
commit bdc558756a
2 files changed, 29 insertions(+), 14 deletions(-)

View file

@ -4,9 +4,10 @@
# (C) 2023 Thinkst Applied Research, PTY
# Author: Jacob Torrey <jacob@thinkst.com>
import lzma, argparse, os
import lzma, argparse, os, itertools
import re
from typing import List, Optional, Tuple
from multiprocessing import Pool, cpu_count
# The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
PRELUDE_FILE : str = 'ai-generated.txt'
@ -15,13 +16,16 @@ with open(PRELUDE_FILE, 'r') as fp:
class LzmaLlmDetector:
'''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''
def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None) -> None:
def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:
'''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''
self.PRESET : int = 0
self.PRESET : int = 2
self.comp = lzma.LZMACompressor(preset=self.PRESET)
self.c_buf : List[bytes] = []
self.in_bytes : int = 0
self.prelude_ratio : float = 0.0
if prelude_ratio is None:
self.prelude_ratio : float = 0.0
else:
self.prelude_ratio : float = prelude_ratio
self.FUZZINESS_THRESHOLD = fuzziness_digits
self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"
@ -36,9 +40,10 @@ class LzmaLlmDetector:
self._compress_str(fp.read())
if prelude_str != None:
self._compress_str(prelude_str)
self.prelude_ratio = self._finalize()
self.comp = lzma.LZMACompressor(preset=self.PRESET)
if self.prelude_ratio == 0.0:
self._compress_str(prelude_str)
self.prelude_ratio = self._finalize()
self.comp = lzma.LZMACompressor(preset=self.PRESET)
self._compress_str(prelude_str)
def _compress_str(self, s : str) -> None:
@ -101,7 +106,11 @@ def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, floa
#print('Calculating score for input of length ' + str(len(txt)))
return l.score_text(txt)
def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
def _score_chunk(c : str, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Tuple[str, float]:
    '''Score a single text chunk with a fresh detector seeded from the prelude string.

    Passing a precomputed prelude_ratio lets the detector skip recompressing the
    prelude, which matters when this runs once per chunk in a worker process.
    '''
    detector = LzmaLlmDetector(prelude_str=PRELUDE_STR, fuzziness_digits=fuzziness, prelude_ratio=prelude_ratio)
    return detector.score_text(c)
def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Tuple[str, float]]:
'''
Given a filename (and an optional chunk size and number of decimal places to round to) returns the score for the contents of that file.
This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@ -125,9 +134,13 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int
start = end + 1
chunks.append(contents[start:])
scores = []
for c in chunks:
l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_str=PRELUDE_STR)
scores.append(l.score_text(c))
if len(chunks) > 2:
with Pool(cpu_count()) as pool:
for r in pool.starmap(_score_chunk, zip(chunks, itertools.repeat(fuzziness), itertools.repeat(prelude_ratio))):
scores.append(r)
else:
for c in chunks:
scores.append(_score_chunk(c, fuzziness=fuzziness, prelude_ratio=prelude_ratio))
ssum : float = 0.0
for s in scores:
if s[0] == 'AI':

View file

@ -2,7 +2,7 @@
import pytest, os
from warnings import warn
from lzma_detect import run_on_file_chunked
from lzma_detect import run_on_file_chunked, PRELUDE_STR, LzmaLlmDetector
AI_SAMPLE_DIR = 'samples/llm-generated/'
HUMAN_SAMPLE_DIR = 'samples/human-generated/'
@ -13,12 +13,14 @@ human_files = os.listdir(HUMAN_SAMPLE_DIR)
FUZZINESS = 3
CONFIDENCE_THRESHOLD : float = 0.00 # What confidence to treat as error vs warning
PRELUDE_RATIO = LzmaLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
def test_training_file():
    '''Sanity check: the corpus used to seed the detector must itself classify as AI.'''
    classification = run_on_file_chunked('ai-generated.txt')[0]
    assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
@pytest.mark.parametrize('f', human_files)
def test_human_samples(f):
(classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, fuzziness=FUZZINESS)
(classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
if score > CONFIDENCE_THRESHOLD:
assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
else:
@ -29,7 +31,7 @@ def test_human_samples(f):
@pytest.mark.parametrize('f', ai_files)
def test_llm_sample(f):
(classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, fuzziness=FUZZINESS)
(classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
if score > CONFIDENCE_THRESHOLD:
assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
else: