2023-05-10 16:56:02 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
# Code to attempt to detect AI-generated text [relatively] quickly via compression ratios
|
|
|
|
# (C) 2023 Thinkst Applied Research, PTY
|
|
|
|
# Author: Jacob Torrey <jacob@thinkst.com>
|
|
|
|
|
2023-05-10 20:32:43 +00:00
|
|
|
import lzma, argparse, os
|
2023-05-10 16:56:02 +00:00
|
|
|
from typing import List, Optional, Tuple
|
|
|
|
|
2023-05-10 17:36:43 +00:00
|
|
|
# The prelude file is a text file containing only AI-generated text; it is used to 'seed' the LZMA dictionary
|
2023-05-10 16:56:02 +00:00
|
|
|
PRELUDE_FILE : str = 'ai-generated.txt'
|
|
|
|
|
|
|
|
class LzmaLlmDetector:
    '''
    Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm

    The detector compresses an optional "prelude" text on its own to get a baseline ratio
    (compressed bytes / input bytes), then compresses prelude + sample in a single LZMA stream
    and compares the two ratios.
    '''

    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3) -> None:
        '''
        Initializes a compressor with the passed prelude file, and optionally the number of digits to round to
        when comparing prelude vs. sample compression

        prelude_file: path of a text file used to seed the compressor, or None for no prelude
        fuzziness_digits: number of decimal digits passed to round() when deciding whether the
                          ratio difference counts as zero
        '''
        self.comp = lzma.LZMACompressor()
        self.c_buf : List[bytes] = []
        self.in_bytes : int = 0
        self.prelude_ratio : float = 0.0
        self.FUZZINESS_THRESHOLD = fuzziness_digits
        # Cached prelude text so the compressor can be re-primed without re-reading the file
        self._prelude_text : Optional[str] = None
        # True once the current LZMA stream has been flushed and cannot accept more input
        self._spent : bool = False

        if prelude_file is not None:
            # Read the file once; the text is encoded as UTF-8 before compression, so decode it as UTF-8 too
            with open(prelude_file, 'r', encoding='utf-8') as fp:
                self._prelude_text = fp.read()
            # First pass: compress the prelude alone to get its baseline compression ratio
            self._compress_str(self._prelude_text)
            self.prelude_ratio = self._finalize()
            # Second pass: start a fresh stream (with fresh counters) primed with the prelude
            self._prime()

    def _prime(self) -> None:
        '''
        Internal helper that starts a fresh LZMA stream, clears the byte counters and output buffer,
        and feeds the prelude (if any) into the new stream
        '''
        self.comp = lzma.LZMACompressor()
        self.c_buf = []
        self.in_bytes = 0
        self._spent = False
        if self._prelude_text is not None:
            self._compress_str(self._prelude_text)

    def _compress_str(self, s : str) -> None:
        '''
        Internal helper function to compress a string

        The string is UTF-8 encoded, fed to the current compressor, and the input byte count is updated
        '''
        strb : bytes = s.encode('utf-8')
        self.c_buf.append(self.comp.compress(strb))
        self.in_bytes += len(strb)

    def _finalize(self) -> float:
        '''
        Finalizes an LZMA compression cycle and returns the compression ratio
        (compressed bytes / input bytes), or 0.0 if no input bytes were compressed

        post: _ >= 0
        '''
        self.c_buf.append(self.comp.flush())
        # A flushed compressor cannot be used again; remember that so the next query re-primes
        self._spent = True
        compressed_size : int = sum(len(chunk) for chunk in self.c_buf)
        if self.in_bytes == 0:
            return 0.0
        return compressed_size / self.in_bytes

    def get_compression_ratio(self, s : str) -> Tuple[float, float]:
        '''
        Returns a tuple of floats with the compression ratio of the prelude (0 if no prelude) and
        of the prelude followed by the passed string
        '''
        if self._spent:
            # A previous call flushed the stream; rebuild it so the detector can be reused
            self._prime()
        self._compress_str(s)
        return (self.prelude_ratio, self._finalize())

    def score_text(self, sample : str) -> Optional[Tuple[str, float]]:
        '''
        Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated
        by either an AI or human. Returns None if it cannot make a determination (no usable prelude ratio)
        '''
        if self.prelude_ratio == 0.0:
            return None
        (prelude_score, sample_score) = self.get_compression_ratio(sample)
        # Positive delta: adding the sample improved the ratio relative to the prelude alone
        delta = prelude_score - sample_score
        determination = 'AI'
        if delta < 0 or round(delta, self.FUZZINESS_THRESHOLD) == 0:
            determination = 'Human'
        if abs(delta * 100) < .1 and determination == 'AI':
            print("Very low-confidence determination of: " + determination)
        return (determination, abs(delta * 100))
|
|
|
|
|
2023-05-10 20:32:43 +00:00
|
|
|
def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
    '''
    Scores the text in the passed file using a detector built from PRELUDE_FILE

    filename: path of the text file holding the sample to classify
    fuzziness: number of rounding digits handed to LzmaLlmDetector
    Returns whatever LzmaLlmDetector.score_text returns for the file's contents
    '''
    # The detector encodes text as UTF-8 before compressing, so decode the sample as UTF-8
    # rather than with the platform's locale encoding
    with open(filename, 'r', encoding='utf-8') as fp:
        detector = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
        return detector.score_text(fp.read())
|
|
|
|
|
2023-05-10 17:36:43 +00:00
|
|
|
if __name__ == '__main__':
    # Command-line entry point: classify every sample file named on the command line
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to classify')
    cli_args = arg_parser.parse_args()

    for sample_path in cli_args.sample_files:
        # The path is always echoed, even when it is skipped below
        print(sample_path)
        if not os.path.isfile(sample_path):
            continue
        outcome = run_on_file(sample_path)
        print(str(outcome))
|