Strip whitespace, add to ai-generated, and improve tuning parameters

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
Jacob Torrey 2023-05-11 14:26:59 -06:00
parent 77e60befab
commit ae5f458b5f
3 changed files with 124 additions and 11 deletions

File diff suppressed because one or more lines are too long

View file

@@ -5,20 +5,25 @@
 # Author: Jacob Torrey <jacob@thinkst.com>
 import lzma, argparse, os
+import re
 from typing import List, Optional, Tuple
 # The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
 PRELUDE_FILE : str = 'ai-generated.txt'
+with open(PRELUDE_FILE, 'r') as fp:
+    PRELUDE_STR = fp.read()
 class LzmaLlmDetector:
     '''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''
-    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3) -> None:
+    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None) -> None:
         '''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''
-        self.comp = lzma.LZMACompressor()
+        self.PRESET : int = 0
+        self.comp = lzma.LZMACompressor(preset=self.PRESET)
         self.c_buf : List[bytes] = []
         self.in_bytes : int = 0
         self.prelude_ratio : float = 0.0
         self.FUZZINESS_THRESHOLD = fuzziness_digits
+        self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"
         if prelude_file != None:
             # Read it once to get the default compression ratio for the prelude
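For context on what these tuning parameters affect: the detector's core signal is the difference between the compression ratio of the prelude alone and the ratio once a sample is appended, so the preset only needs to be consistent across both measurements, not maximal. A minimal sketch of that comparison, using assumed names rather than the project's API:

import lzma

def compression_ratio(data: bytes, preset: int = 0) -> float:
    # Ratio of compressed size to input size; lower means more redundancy
    return len(lzma.compress(data, preset=preset)) / len(data)

# prelude = open('ai-generated.txt', 'rb').read()
# sample = b'text to classify'
# delta = compression_ratio(prelude) - compression_ratio(prelude + sample)
# delta >= 0: the sample compresses well against AI-generated text ('AI');
# delta < 0: it does not ('Human'); see the scoring hunk below.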
@@ -26,9 +31,15 @@ class LzmaLlmDetector:
                 self._compress_str(fp.read())
             self.prelude_ratio = self._finalize()
             # Redo this to prime the compressor
-            self.comp = lzma.LZMACompressor()
+            self.comp = lzma.LZMACompressor(preset=self.PRESET)
             with open(prelude_file, 'r') as fp:
                 self._compress_str(fp.read())
+        if prelude_str != None:
+            self._compress_str(prelude_str)
+            self.prelude_ratio = self._finalize()
+            self.comp = lzma.LZMACompressor(preset=self.PRESET)
+            self._compress_str(prelude_str)
     def _compress_str(self, s : str) -> None:
         '''
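The repeated `self.comp = lzma.LZMACompressor(preset=self.PRESET)` assignments are not redundant: an LZMACompressor is single-use, so after `_finalize()` flushes the stream to measure the prelude ratio, a fresh compressor must be created and re-primed for the sample. A small demonstration of that constraint:

import lzma

comp = lzma.LZMACompressor(preset=0)
out = comp.compress(b'prelude text')
out += comp.flush()  # terminates the stream so the compressed size can be measured
# comp.compress(b'more')  # would raise ValueError: the compressor has been flushed
comp = lzma.LZMACompressor(preset=0)  # hence a new compressor, primed again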
@@ -68,7 +79,15 @@
         #print(str((prelude_score, sample_score)))
         delta = prelude_score - sample_score
         determination = 'AI'
-        if delta < 0 or round(delta, self.FUZZINESS_THRESHOLD) == 0:
+        if delta < 0:
             determination = 'Human'
+        # If the sample doesn't 'move the needle', it's very close
+        if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) >= self.SHORT_SAMPLE_THRESHOLD:
+            #print('Sample len to default to AI: ' + str(len(sample)))
+            determination = 'AI'
+        if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) < self.SHORT_SAMPLE_THRESHOLD:
+            #print('Sample len to default to Human: ' + str(len(sample)))
+            determination = 'Human'
         #if abs(delta * 100) < .1 and determination == 'AI':
         #    print("Very low-confidence determination of: " + determination)
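Pulled out of the diff, the new decision rule is: a negative delta means 'Human'; a delta that rounds to zero at FUZZINESS_THRESHOLD digits is broken by sample length, defaulting long samples to 'AI' and short ones to 'Human'. A standalone sketch (hypothetical helper, with the thresholds from this commit):

def classify(delta: float, sample_len: int,
             fuzziness_digits: int = 3, short_threshold: int = 350) -> str:
    determination = 'AI'
    if delta < 0:
        determination = 'Human'
    if round(delta, fuzziness_digits) == 0:
        # Too close to call from compression alone; fall back on length
        determination = 'AI' if sample_len >= short_threshold else 'Human'
    return determination

# classify(0.0004, 500) -> 'AI'     (rounds to 0, long sample)
# classify(0.0004, 100) -> 'Human'  (rounds to 0, short sample)
# classify(-0.02, 500)  -> 'Human'  (clearly negative delta)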
@@ -90,6 +109,13 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int
     '''
     with open(filename, 'r') as fp:
         contents = fp.read()
+    # Remove extra spaces and duplicate newlines.
+    contents = re.sub(' +', ' ', contents)
+    contents = re.sub('\t', '', contents)
+    contents = re.sub('\n+', '\n', contents)
+    contents = re.sub('\n ', '\n', contents)
     start = 0
     end = 0
     chunks = []
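The normalization pass collapses runs of spaces and duplicate newlines, deletes tabs outright, and strips a space left at the start of a line; note that a tab between two words is removed rather than replaced, so those words are joined. Traced on a small example:

import re

text = 'A  sentence with extra spaces.\t\n\n\n Indented line.\n'
text = re.sub(' +', ' ', text)    # collapse runs of spaces
text = re.sub('\t', '', text)     # delete tabs (does not insert a space)
text = re.sub('\n+', '\n', text)  # collapse duplicate newlines
text = re.sub('\n ', '\n', text)  # drop the leading space after a newline
print(repr(text))  # 'A sentence with extra spaces.\nIndented line.\n'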
@@ -100,7 +126,7 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1024, fuzziness : int
         chunks.append(contents[start:])
     scores = []
     for c in chunks:
-        l = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
+        l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_str=PRELUDE_STR)
         scores.append(l.score_text(c))
     ssum : float = 0.0
     for s in scores:
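The chunk-construction loop itself is elided by the viewer; based on the visible tail-append, it presumably slices the normalized contents into chunk_size pieces with the remainder as a final chunk, roughly:

def make_chunks(contents: str, chunk_size: int = 1024):
    # Assumed shape of the elided loop: fixed-size slices plus the tail
    chunks = []
    start = 0
    while start + chunk_size < len(contents):
        chunks.append(contents[start:start + chunk_size])
        start += chunk_size
    chunks.append(contents[start:])
    return chunks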
@@ -119,6 +145,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to classify')
     args = parser.parse_args()
     for f in args.sample_files:
+        print(f)
         if os.path.isfile(f):
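Command-line usage is unchanged by this commit, while callers invoking the module programmatically now benefit from PRELUDE_STR being read once at import time rather than once per chunk. A hypothetical invocation (the sample path is invented for illustration):

# $ python3 lzma_detect.py samples/human-generated/essay.txt
from lzma_detect import run_on_file_chunked

# Returns a (classification, score) tuple, as the tests below expect
result = run_on_file_chunked('samples/human-generated/essay.txt', fuzziness=3)
print(result)  # e.g. ('Human', 0.00123456)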

View file

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import pytest, os
+from warnings import warn
 from lzma_detect import run_on_file_chunked
 AI_SAMPLE_DIR = 'samples/llm-generated/'
@@ -9,15 +10,30 @@ HUMAN_SAMPLE_DIR = 'samples/human-generated/'
 ai_files = os.listdir(AI_SAMPLE_DIR)
 human_files = os.listdir(HUMAN_SAMPLE_DIR)
 FUZZINESS = 3
+CONFIDENCE_THRESHOLD : float = 0.00 # What confidence to treat as error vs warning
 def test_training_file():
-    assert run_on_file('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
+    assert run_on_file_chunked('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
 @pytest.mark.parametrize('f', human_files)
 def test_human_samples(f):
-    (classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f)
-    assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
+    (classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, fuzziness=FUZZINESS)
+    if score > CONFIDENCE_THRESHOLD:
+        assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
+    else:
+        if classification != 'Human':
+            warn("Misclassified " + f + " with score of: " + str(round(score, 8)))
+        else:
+            warn("Unable to confidently classify: " + f)
 @pytest.mark.parametrize('f', ai_files)
 def test_llm_sample(f):
-    (classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f)
-    assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
+    (classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, fuzziness=FUZZINESS)
+    if score > CONFIDENCE_THRESHOLD:
+        assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
+    else:
+        if classification != 'AI':
+            warn("Misclassified " + f + " with score of: " + str(round(score, 8)))
+        else:
+            warn("Unable to confidently classify: " + f)
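The test change implements a soft-fail pattern: results scored above CONFIDENCE_THRESHOLD must be classified correctly or the test fails, while low-confidence results only emit pytest warnings. The shared shape of both tests, factored into a hypothetical helper:

from warnings import warn

def check_classification(expected: str, classification: str, score: float,
                         threshold: float = 0.00) -> None:
    # Confident result: a misclassification is a hard failure
    if score > threshold:
        assert classification == expected
    # Low-confidence result: downgrade to a warning either way
    elif classification != expected:
        warn('Misclassified with score of: ' + str(round(score, 8)))
    else:
        warn('Unable to confidently classify sample')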