zippy/lzma_detect.py

#!/usr/bin/env python3

# Code to attempt to detect AI-generated text [relatively] quickly via compression ratios
# (C) 2023 Thinkst Applied Research, PTY
# Author: Jacob Torrey <jacob@thinkst.com>

import lzma, argparse, os
from typing import List, Optional, Tuple

# The prelude file is a text file containing only AI-generated text; it is used to 'seed' the LZMA dictionary.
# run_on_file() passes this path to LzmaLlmDetector. The path is relative, so it is resolved against the
# current working directory at the time the file is opened.
PRELUDE_FILE : str = 'ai-generated.txt'

class LzmaLlmDetector:
    '''
    Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm

    The detector compresses a prelude (expected to be AI-generated text) on its own, then compresses the
    prelude followed by a sample. If appending the sample improves the overall compression ratio, the
    sample is labelled 'AI'; otherwise it is labelled 'Human'. An instance may score any number of samples.
    '''
    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3) -> None:
        '''
        Initializes the detector with the passed prelude file, and optionally the number of digits to round to
        when comparing prelude vs. sample compression

        prelude_file: path of the text file used as the prelude; None leaves the detector without a prelude
                      (score_text will then return None)
        fuzziness_digits: number of decimal digits the ratio difference is rounded to before it is compared with zero

        Raises whatever open()/read() raise if prelude_file cannot be read.
        '''
        self.FUZZINESS_THRESHOLD = fuzziness_digits
        self.prelude_ratio : float = 0.0
        # The prelude text is kept so every sample can be compressed behind it with a fresh compressor
        self.prelude_str : str = ''
        self._reset()

        if prelude_file is not None:
            # Read the file once; the same text gives the baseline ratio and later primes each sample run
            with open(prelude_file, 'r') as fp:
                self.prelude_str = fp.read()
            self._compress_str(self.prelude_str)
            self.prelude_ratio = self._finalize()

    def _reset(self) -> None:
        '''
        Internal helper function to start a new compression cycle: fresh compressor, empty output buffer, zero input count
        '''
        self.comp = lzma.LZMACompressor()
        self.c_buf : List[bytes] = []
        self.in_bytes : int = 0

    def _compress_str(self, s : str) -> None:
        '''
        Internal helper function to compress a string

        The string is encoded as UTF-8, fed to the current compressor, and its byte length is added to in_bytes
        '''
        strb : bytes = s.encode('utf-8')
        self.c_buf.append(self.comp.compress(strb))
        self.in_bytes += len(strb)

    def _finalize(self) -> float:
        '''
        Finalizes an LZMA compression cycle and returns the compression ratio as a fraction
        (compressed bytes / input bytes), or 0.0 if no input bytes were compressed

        The internal state is reset afterwards so that the next cycle starts from a usable compressor.

        post: _ >= 0
        '''
        self.c_buf.append(self.comp.flush())
        compressed_size : int = sum(len(chunk) for chunk in self.c_buf)
        consumed : int = self.in_bytes
        # A flushed LZMACompressor cannot be used again, and stale counts would skew the next ratio
        self._reset()
        if consumed == 0:
            return 0.0
        return compressed_size / consumed

    def get_compression_ratio(self, s : str) -> Tuple[float, float]:
        '''
        Returns a tuple of floats with the compression ratio of the prelude (0 if no prelude) and the
        compression ratio of the prelude followed by the passed string
        '''
        # Always start clean so that earlier calls (or an earlier failure) cannot leak into this measurement
        self._reset()
        self._compress_str(self.prelude_str + s)
        return (self.prelude_ratio, self._finalize())

    def score_text(self, sample : str) -> Optional[Tuple[str, float]]:
        '''
        Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated
        by either an AI or human. Returns None if it cannot make a determination

        The confidence is the absolute difference between the two ratios, multiplied by 100.
        '''
        # Without a prelude baseline there is nothing to compare the sample against
        if self.prelude_ratio == 0.0:
            return None
        (prelude_score, sample_score) = self.get_compression_ratio(sample)
        # Positive delta: adding the sample made the combined text compress better than the prelude alone
        delta = prelude_score - sample_score
        determination = 'AI'
        if delta < 0 or round(delta, self.FUZZINESS_THRESHOLD) == 0:
            determination = 'Human'
        if abs(delta * 100) < .1 and determination == 'AI':
            print("Very low-confidence determination of: " + determination)
        return (determination, abs(delta * 100))
        
def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
    '''
    Scores the text stored in filename with a detector built from PRELUDE_FILE

    fuzziness is forwarded to LzmaLlmDetector as its rounding precision. The return value is
    whatever LzmaLlmDetector.score_text returns for the file's contents.
    '''
    with open(filename, 'r') as sample_fp:
        detector = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
        sample_text = sample_fp.read()
        verdict = detector.score_text(sample_text)
    return verdict

if __name__ == '__main__':
    # Command-line entry point: print each given path, followed by its score if the path is an existing file
    cli = argparse.ArgumentParser()
    cli.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to classify')
    options = cli.parse_args()
    for sample_path in options.sample_files:
        print(sample_path)
        # Paths that are not regular files are echoed above but otherwise skipped
        if not os.path.isfile(sample_path):
            continue
        print(str(run_on_file(sample_path)))
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`#!/usr/bin/env python3`

			`# Code to attempt to detect AT-generated text [relatively] quickly via compression ratios`
			`# (C) 2023 Thinkst Applied Research, PTY`
			`# Author: Jacob Torrey <jacob@thinkst.com>`

Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`import lzma, argparse, os`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`from typing import List, Optional, Tuple`

Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`# The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`PRELUDE_FILE : str = 'ai-generated.txt'`

			`class LzmaLlmDetector:`
			`'''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''`
			`def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3) -> None:`
			`'''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''`
			`self.comp = lzma.LZMACompressor()`
			`self.c_buf : List[bytes] = []`
			`self.in_bytes : int = 0`
			`self.prelude_ratio : float = 0.0`
			`self.FUZZINESS_THRESHOLD = fuzziness_digits`

			`if prelude_file != None:`
			`# Read it once to get the default compression ratio for the prelude`
			`with open(prelude_file, 'r') as fp:`
			`self._compress_str(fp.read())`
			`self.prelude_ratio = self._finalize()`
			`# Redo this to prime the compressor`
			`self.comp = lzma.LZMACompressor()`
			`with open(prelude_file, 'r') as fp:`
			`self._compress_str(fp.read())`

			`def _compress_str(self, s : str) -> None:`
			`'''`
			`Internal helper function to compress a string`
			`'''`
			`strb : bytes = s.encode('utf-8')`
			`self.c_buf.append(self.comp.compress(strb))`
			`self.in_bytes += len(strb)`

			`def _finalize(self) -> float:`
			`'''`
			`Finalizes an LZMA compression cycle and returns the percentage compression ratio`

			`post: _ >= 0`
			`'''`
			`self.c_buf.append(self.comp.flush())`
			`compressed_size : int = len(b''.join(self.c_buf))`
			`if self.in_bytes == 0:`
			`return 0.0`
			`return compressed_size / self.in_bytes`

			`def get_compression_ratio(self, s : str) -> Tuple[float, float]:`
			`'''`
			`Returns a tuple of floats with the compression ratio of the prelude (0 if no prelude) and passed string`
			`'''`
			`self._compress_str(s)`
			`return (self.prelude_ratio, self._finalize())`

			`def score_text(self, sample : str) -> Optional[Tuple[str, float]]:`
			`'''`
			`Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated`
			`by either an AI or human. Returns None if it cannot make a determination`
			`'''`
			`if self.prelude_ratio == 0.0:`
			`return None`
			`(prelude_score, sample_score) = self.get_compression_ratio(sample)`
Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`#print(str((prelude_score, sample_score)))`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`delta = prelude_score - sample_score`
			`determination = 'AI'`
			`if delta < 0 or round(delta, self.FUZZINESS_THRESHOLD) == 0:`
			`determination = 'Human'`
Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`if abs(delta * 100) < .1 and determination == 'AI':`
			`print("Very low-confidence determination of: " + determination)`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`return (determination, abs(delta * 100))`

Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, float]]:`
Added testing framework to determine what samples fail Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:46:59 +00:00			`with open(filename, 'r') as fp:`
Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`l = LzmaLlmDetector(PRELUDE_FILE, fuzziness)`
Added testing framework to determine what samples fail Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:46:59 +00:00			`return l.score_text(fp.read())`

Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser()`
Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to classify')`
Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`args = parser.parse_args()`
Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`for f in args.sample_files:`
			`print(f)`
			`if os.path.isfile(f):`
			`print(str(run_on_file(f)))`