Strip whitespace, add to ai-generated, and improve tuning parameters

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
Jacob Torrey 2023-05-11 14:26:59 -06:00
parent 77e60befab
commit ae5f458b5f
3 changed files with 124 additions and 11 deletions

File diff suppressed because one or more lines are too long

View file

@@ -5,20 +5,25 @@
 # Author: Jacob Torrey <jacob@thinkst.com>
 import lzma, argparse, os
+import re
 from typing import List, Optional, Tuple
 # The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
 PRELUDE_FILE : str = 'ai-generated.txt'
+with open(PRELUDE_FILE, 'r') as fp:
+    PRELUDE_STR = fp.read()
 class LzmaLlmDetector:
     '''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''
-    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3) -> None:
+    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None) -> None:
         '''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''
-        self.comp = lzma.LZMACompressor()
+        self.PRESET : int = 0
+        self.comp = lzma.LZMACompressor(preset=self.PRESET)
         self.c_buf : List[bytes] = []
         self.in_bytes : int = 0
         self.prelude_ratio : float = 0.0
         self.FUZZINESS_THRESHOLD = fuzziness_digits
+        self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"
         if prelude_file != None:
             # Read it once to get the default compression ratio for the prelude
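For context on what these tuning parameters affect: the detector's core signal is the difference between the compression ratio of the prelude alone and the ratio once a sample is appended, so the preset only needs to be consistent across both measurements, not maximal. A minimal sketch of that comparison, using assumed names rather than the project's API:

import lzma

def compression_ratio(data: bytes, preset: int = 0) -> float:
    # Ratio of compressed size to input size; lower means more redundancy
    return len(lzma.compress(data, preset=preset)) / len(data)

# prelude = open('ai-generated.txt', 'rb').read()
# sample = b'text to classify'
# delta = compression_ratio(prelude) - compression_ratio(prelude + sample)
# delta >= 0: the sample compresses well against AI-generated text ('AI');
# delta < 0: it does not ('Human'); see the scoring hunk below.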
@@ -26,9 +31,15 @@ class LzmaLlmDetector:
                 self._compress_str(fp.read())
             self.prelude_ratio = self._finalize()
             # Redo this to prime the compressor
-            self.comp = lzma.LZMACompressor()
+            self.comp = lzma.LZMACompressor(preset=self.PRESET)
             with open(prelude_file, 'r') as fp:
                 self._compress_str(fp.read())
+        if prelude_str != None:
+            self._compress_str(prelude_str)
+            self.prelude_ratio = self._finalize()
+            self.comp = lzma.LZMACompressor(preset=self.PRESET)
+            self._compress_str(prelude_str)
     def _compress_str(self, s : str) -> None:
         '''
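The repeated `self.comp = lzma.LZMACompressor(preset=self.PRESET)` assignments are not redundant: an LZMACompressor is single-use, so after `_finalize()` flushes the stream to measure the prelude ratio, a fresh compressor must be created and re-primed for the sample. A small demonstration of that constraint:

import lzma

comp = lzma.LZMACompressor(preset=0)
out = comp.compress(b'prelude text')
out += comp.flush()  # terminates the stream so the compressed size can be measured
# comp.compress(b'more')  # would raise ValueError: the compressor has been flushed
comp = lzma.LZMACompressor(preset=0)  # hence a new compressor, primed again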
@@ -68,7 +79,15 @@
         #print(str((prelude_score, sample_score)))
         delta = prelude_score - sample_score
         determination = 'AI'
-        if delta < 0 or round(delta, self.FUZZINESS_THRESHOLD) == 0:
+        if delta < 0:
             determination = 'Human'
+        # If the sample doesn't 'move the needle', it's very close
+        if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) >= self.SHORT_SAMPLE_THRESHOLD:
+            #print('Sample len to default to AI: ' + str(len(sample)))
+            determination = 'AI'
+        if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) < self.SHORT_SAMPLE_THRESHOLD:
+            #print('Sample len to default to Human: ' + str(len(sample)))
+            determination = 'Human'
         #if abs(delta * 100) < .1 and determination == 'AI':
         #    print("Very low-confidence determination of: " + determination)
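Pulled out of the diff, the new decision rule is: a negative delta means 'Human'; a delta that rounds to zero at FUZZINESS_THRESHOLD digits is broken by sample length, defaulting long samples to 'AI' and short ones to 'Human'. A standalone sketch (hypothetical helper, with the thresholds from this commit):

def classify(delta: float, sample_len: int,
             fuzziness_digits: int = 3, short_threshold: int = 350) -> str:
    determination = 'AI'
    if delta < 0:
        determination = 'Human'
    if round(delta, fuzziness_digits) == 0:
        # Too close to call from compression alone; fall back on length
        determination = 'AI' if sample_len >= short_threshold else 'Human'
    return determination

# classify(0.0004, 500) -> 'AI'     (rounds to 0, long sample)
# classify(0.0004, 100) -> 'Human'  (rounds to 0, short sample)
# classify(-0.02, 500)  -> 'Human'  (clearly negative delta)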
@@ -90,6 +109,13 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int
     '''
     with open(filename, 'r') as fp:
         contents = fp.read()
+    # Remove extra spaces and duplicate newlines.
+    contents = re.sub(' +', ' ', contents)
+    contents = re.sub('\t', '', contents)
+    contents = re.sub('\n+', '\n', contents)
+    contents = re.sub('\n ', '\n', contents)
     start = 0
     end = 0
     chunks = []
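The normalization pass collapses runs of spaces and duplicate newlines, deletes tabs outright, and strips a space left at the start of a line; note that a tab between two words is removed rather than replaced, so those words are joined. Traced on a small example:

import re

text = 'A  sentence with extra spaces.\t\n\n\n Indented line.\n'
text = re.sub(' +', ' ', text)    # collapse runs of spaces
text = re.sub('\t', '', text)     # delete tabs (does not insert a space)
text = re.sub('\n+', '\n', text)  # collapse duplicate newlines
text = re.sub('\n ', '\n', text)  # drop the leading space after a newline
print(repr(text))  # 'A sentence with extra spaces.\nIndented line.\n'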
@@ -100,7 +126,7 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int
         chunks.append(contents[start:])
     scores = []
     for c in chunks:
-        l = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
+        l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_str=PRELUDE_STR)
         scores.append(l.score_text(c))
     ssum : float = 0.0
     for s in scores:
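The chunk-construction loop itself is elided by the viewer; based on the visible tail-append, it presumably slices the normalized contents into chunk_size pieces with the remainder as a final chunk, roughly:

def make_chunks(contents: str, chunk_size: int = 1024):
    # Assumed shape of the elided loop: fixed-size slices plus the tail
    chunks = []
    start = 0
    while start + chunk_size < len(contents):
        chunks.append(contents[start:start + chunk_size])
        start += chunk_size
    chunks.append(contents[start:])
    return chunks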
@@ -119,6 +145,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to classify')
     args = parser.parse_args()
     for f in args.sample_files:
+        print(f)
         if os.path.isfile(f):
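Command-line usage is unchanged by this commit, while callers invoking the module programmatically now benefit from PRELUDE_STR being read once at import time rather than once per chunk. A hypothetical invocation (the sample path is invented for illustration):

# $ python3 lzma_detect.py samples/human-generated/essay.txt
from lzma_detect import run_on_file_chunked

# Returns a (classification, score) tuple, as the tests below expect
result = run_on_file_chunked('samples/human-generated/essay.txt', fuzziness=3)
print(result)  # e.g. ('Human', 0.00123456)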

View file

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import pytest, os
+from warnings import warn
 from lzma_detect import run_on_file_chunked
 AI_SAMPLE_DIR = 'samples/llm-generated/'
@@ -9,15 +10,30 @@ HUMAN_SAMPLE_DIR = 'samples/human-generated/'
 ai_files = os.listdir(AI_SAMPLE_DIR)
 human_files = os.listdir(HUMAN_SAMPLE_DIR)
 FUZZINESS = 3
+CONFIDENCE_THRESHOLD : float = 0.00 # What confidence to treat as error vs warning
 def test_training_file():
-    assert run_on_file('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
+    assert run_on_file_chunked('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
 @pytest.mark.parametrize('f', human_files)
 def test_human_samples(f):
-    (classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f)
-    assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
+    (classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, fuzziness=FUZZINESS)
+    if score > CONFIDENCE_THRESHOLD:
+        assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
+    else:
+        if classification != 'Human':
+            warn("Misclassified " + f + " with score of: " + str(round(score, 8)))
+        else:
+            warn("Unable to confidently classify: " + f)
 @pytest.mark.parametrize('f', ai_files)
 def test_llm_sample(f):
-    (classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f)
-    assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
+    (classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, fuzziness=FUZZINESS)
+    if score > CONFIDENCE_THRESHOLD:
+        assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
+    else:
+        if classification != 'AI':
+            warn("Misclassified " + f + " with score of: " + str(round(score, 8)))
+        else:
+            warn("Unable to confidently classify: " + f)
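The test change implements a soft-fail pattern: results scored above CONFIDENCE_THRESHOLD must be classified correctly or the test fails, while low-confidence results only emit pytest warnings. The shared shape of both tests, factored into a hypothetical helper:

from warnings import warn

def check_classification(expected: str, classification: str, score: float,
                         threshold: float = 0.00) -> None:
    # Confident result: a misclassification is a hard failure
    if score > threshold:
        assert classification == expected
    # Low-confidence result: downgrade to a warning either way
    elif classification != expected:
        warn('Misclassified with score of: ' + str(round(score, 8)))
    else:
        warn('Unable to confidently classify sample')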