zippy/zippy.py

#!/usr/bin/env python3

# Code to attempt to detect AI-generated text [relatively] quickly via compression ratios
# (C) 2023 Thinkst Applied Research, PTY
# Author: Jacob Torrey <jacob@thinkst.com>

import lzma, argparse, os, itertools
import re, sys
from typing import List, Optional, Tuple
from multiprocessing import Pool, cpu_count

def clean_text(s : str) -> str:
    '''
    Removes formatting and other non-content data that may skew compression ratios (e.g., duplicate spaces)

    The substitutions are applied in a fixed order:
      1. runs of spaces are collapsed to a single space
      2. tabs are removed
      3. runs of newlines are collapsed to a single newline
      4. a single space directly after or directly before a newline is removed
      5. every character other than ASCII letters, digits, ',', '.', '(', ')', space and newline is removed

    Case is preserved. Returns the cleaned string.
    '''
    # Raw strings are used for every pattern so that regex escapes are never
    # interpreted (or rejected) as Python string escapes.

    # Remove extra spaces and duplicate newlines.
    s = re.sub(r' +', ' ', s)
    s = re.sub(r'\t', '', s)
    s = re.sub(r'\n+', '\n', s)
    s = re.sub(r'\n ', '\n', s)
    s = re.sub(r' \n', '\n', s)

    # Remove non-alphanumeric chars ('.', '(' and ')' need no escaping inside a character class)
    s = re.sub(r'[^0-9A-Za-z,.() \n]', '', s)

    return s

def to_english(s : str) -> str:
    '''
    Placeholder for stripping non-English words (or names) from an input.

    No filtering is implemented yet, so the input is handed back unchanged.
    '''
    unchanged = s
    return unchanged

# The prelude file is a text file containing only AI-generated text; it is used to 'seed' the LZMA dictionary.
# NOTE: the path is relative, so it is resolved against the current working directory when this module is imported.
PRELUDE_FILE : str = 'ai-generated.txt'
# Read and clean the prelude once at import time; PRELUDE_STR is the cleaned text handed to the per-chunk detectors.
with open(PRELUDE_FILE, 'r') as fp:
    PRELUDE_STR = clean_text(fp.read())

class LzmaLlmDetector:
    '''
    Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm

    A compressor is primed with a prelude of known AI-generated text. A sample is then appended to the same
    compression stream and the compression ratio of prelude + sample is compared with the ratio of the prelude
    on its own: if the sample does not raise the ratio it is labelled 'AI', otherwise 'Human'.
    '''
    def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:
        '''
        Initializes a compressor primed with the passed prelude.

        prelude_file: optional path of a text file whose contents prime the compressor; its compression ratio is
                      always measured and replaces any passed prelude_ratio
        fuzziness_digits: number of digits to round to when comparing prelude vs. sample compression; stored as
                          FUZZINESS_THRESHOLD (score_text does not currently apply it)
        prelude_str: optional prelude text to prime the compressor with; its ratio is only measured when no
                     non-zero prelude ratio is known yet
        prelude_ratio: optional precomputed compression ratio of the prelude, saving the measuring pass
        '''
        self.PRESET : int = 1
        self.comp = lzma.LZMACompressor(preset=self.PRESET)
        self.c_buf : List[bytes] = []
        self.in_bytes : int = 0
        self.prelude_ratio : float = 0.0
        if prelude_ratio is not None:
            self.prelude_ratio = prelude_ratio
        self.FUZZINESS_THRESHOLD = fuzziness_digits
        self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"
        # Text the compressor is primed with; kept so the detector can be re-primed after a scoring pass
        self._prelude_parts : List[str] = []
        # True once the current compressor has been flushed (an LZMACompressor cannot be used after flush())
        self._flushed : bool = False

        if prelude_file is not None:
            with open(prelude_file, 'r') as fp:
                prelude_text = fp.read()
            # Compress it once on its own to get the default compression ratio for the prelude
            self._compress_str(prelude_text)
            self.prelude_ratio = self._finalize()
            # Redo this on a fresh compressor to prime it
            self._prelude_parts.append(prelude_text)
            self._prime()

        if prelude_str is not None:
            if self.prelude_ratio == 0.0:
                self._compress_str(prelude_str)
                self.prelude_ratio = self._finalize()
                # Measuring consumed the compressor; start over with an empty one
                self._prelude_parts = []
                self._prime()
            self._compress_str(prelude_str)
            self._prelude_parts.append(prelude_str)

    def _prime(self) -> None:
        '''
        Internal helper function that replaces the compressor with a fresh one and feeds it the stored prelude text
        '''
        self.comp = lzma.LZMACompressor(preset=self.PRESET)
        self.c_buf = []
        self.in_bytes = 0
        for part in self._prelude_parts:
            self._compress_str(part)
        self._flushed = False

    def _compress_str(self, s : str) -> None:
        '''
        Internal helper function to compress a string

        Non-ASCII characters are dropped before compression; the number of bytes actually fed to the
        compressor is added to in_bytes.
        '''
        strb : bytes = s.encode('ascii', errors='ignore')
        self.c_buf.append(self.comp.compress(strb))
        self.in_bytes += len(strb)

    def _finalize(self) -> float:
        '''
        Finalizes an LZMA compression cycle and returns the compression ratio (compressed bytes / input bytes),
        or 0.0 when no input bytes were compressed. The byte counters are always reset.

        post: _ >= 0
        '''
        self.c_buf.append(self.comp.flush())
        compressed_size : int = sum(len(part) for part in self.c_buf)
        consumed : int = self.in_bytes
        # Reset unconditionally so that leftover output can never leak into the next measurement
        self.in_bytes = 0
        self.c_buf = []
        self._flushed = True
        if consumed == 0:
            return 0.0
        return compressed_size / consumed

    def get_compression_ratio(self, s : str) -> Tuple[float, float]:
        '''
        Returns a tuple of floats with the compression ratio of the prelude (0 if no prelude) and of the
        prelude followed by the passed string
        '''
        if self._flushed:
            # A previous call used up the compressor; rebuild it so the detector can score again
            self._prime()
        self._compress_str(s)
        return (self.prelude_ratio, self._finalize())

    def score_text(self, sample : str) -> Optional[Tuple[str, float]]:
        '''
        Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated
        by either an AI or human. Returns None if it cannot make a determination (no prelude ratio is known)
        '''
        if self.prelude_ratio == 0.0:
            return None
        (prelude_score, sample_score) = self.get_compression_ratio(sample)
        delta = prelude_score - sample_score
        # A sample that raises the overall ratio compresses worse than the AI prelude alone
        determination = 'AI'
        if delta < 0:
            determination = 'Human'

        # NOTE: FUZZINESS_THRESHOLD and SHORT_SAMPLE_THRESHOLD are stored but not applied here
        return (determination, abs(delta * 100))
        
def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
    '''
    Scores the whole contents of the named file in one pass (no cleaning, no chunking) using a detector
    primed from PRELUDE_FILE. fuzziness is passed on as the detector's number of rounding digits.
    '''
    with open(filename, 'r') as sample_fp:
        detector = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
        return detector.score_text(sample_fp.read())

def _score_chunk(c : str, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Tuple[str, float]:
    '''
    Scores a single chunk with a newly built detector primed from PRELUDE_STR.
    Kept at module level so it can be handed to a multiprocessing pool.
    '''
    detector = LzmaLlmDetector(
        prelude_str=PRELUDE_STR,
        prelude_ratio=prelude_ratio,
        fuzziness_digits=fuzziness,
    )
    verdict = detector.score_text(c)
    return verdict

def run_on_file_chunked(filename : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Tuple[str, float]]:
    '''
    Reads the named file and returns the chunked score of its contents.
    All of the work is delegated to run_on_text_chunked, which receives the file's text together with the
    chunk size, the number of decimal places to round to and the optional precomputed prelude ratio.
    '''
    with open(filename, 'r') as sample_fp:
        text = sample_fp.read()
    return run_on_text_chunked(text, chunk_size=chunk_size, fuzziness=fuzziness, prelude_ratio=prelude_ratio)

def _split_into_chunks(contents : str, chunk_size : int) -> List[str]:
    '''
    Splits contents into consecutive pieces of at most chunk_size characters.
    A piece ends at the last space inside its window (the space itself is dropped); when the window holds
    no space at all the text is cut at exactly chunk_size characters so that nothing is lost or repeated.
    '''
    chunks : List[str] = []
    start = 0
    while start + chunk_size < len(contents):
        end = contents.rfind(' ', start, start + chunk_size + 1)
        if end == -1:
            # rfind found no space: fall back to a hard cut and keep every character
            end = start + chunk_size
            chunks.append(contents[start:end])
            start = end
        else:
            chunks.append(contents[start:end])
            start = end + 1
    chunks.append(contents[start:])
    return chunks

def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Tuple[str, float]]:
    '''
    Given a string (and an optional chunk size and number of decimal places to round to) returns the score for the passed string.
    This function chunks the cleaned input into parts of at most chunk_size characters, scores them separately, then returns
    an average weighted by chunk length. This prevents a very large input being skewed because its compression ratio
    starts to overwhelm the prelude file.

    Returns None when there is nothing to score (the cleaned input is empty) or when no chunk could be scored.
    Raises ValueError if chunk_size is smaller than 1.
    '''
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    contents = clean_text(s)
    if not contents:
        # Nothing left after cleaning; the length-based weights below would divide by zero
        return None

    chunks = _split_into_chunks(contents, chunk_size)

    # Only pay for worker processes when there are more than two chunks to score
    if len(chunks) > 2:
        with Pool(cpu_count()) as pool:
            scores = pool.starmap(_score_chunk, zip(chunks, itertools.repeat(fuzziness), itertools.repeat(prelude_ratio)))
    else:
        scores = [_score_chunk(c, fuzziness=fuzziness, prelude_ratio=prelude_ratio) for c in chunks]

    # Signed, length-weighted sum: 'AI' chunks pull the total negative, 'Human' chunks positive
    ssum : float = 0.0
    scored_any = False
    for chunk, score in zip(chunks, scores):
        if score is None:
            # The detector could not make a determination for this chunk
            continue
        scored_any = True
        weight = len(chunk) / len(contents)
        if score[0] == 'AI':
            ssum -= score[1] * weight
        else:
            ssum += score[1] * weight
    if not scored_any:
        return None

    if ssum < 0:
        return ('AI', abs(ssum))
    return ('Human', abs(ssum))


if __name__ == '__main__':
    # Command-line entry point: classify either stdin (-s) or each named sample file
    cli = argparse.ArgumentParser()
    source = cli.add_mutually_exclusive_group()
    source.add_argument("-s", action='store_true', required=False, help='Read from stdin until EOF is reached instead of from a file')
    source.add_argument("sample_files", nargs='*', default="", help='Text file(s) containing the sample to classify')
    opts = cli.parse_args()
    if not opts.s:
        for path in opts.sample_files:
            # The name is echoed for every argument; only existing regular files are scored
            print(path)
            if os.path.isfile(path):
                verdict = run_on_file_chunked(path)
                print(str(verdict))
    else:
        stdin_text = ''.join(sys.stdin)
        print(str(run_on_text_chunked(stdin_text)))
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`#!/usr/bin/env python3`

			`# Code to attempt to detect AT-generated text [relatively] quickly via compression ratios`
			`# (C) 2023 Thinkst Applied Research, PTY`
			`# Author: Jacob Torrey <jacob@thinkst.com>`

Improve performance by farming out chunks to other processes with multiprocessing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 14:20:29 +00:00			`import lzma, argparse, os, itertools`
Added stdin reading for lzma_detect Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-06-06 16:34:39 +00:00			`import re, sys`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`from typing import List, Optional, Tuple`
Improve performance by farming out chunks to other processes with multiprocessing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 14:20:29 +00:00			`from multiprocessing import Pool, cpu_count`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00
Clean up cleaning up documents prior to compressing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 21:26:28 +00:00			`def clean_text(s : str) -> str:`
			`'''`
			`Removes formatting and other non-content data that may skew compression ratios (e.g., duplicate spaces)`
			`'''`
			`# Remove extra spaces and duplicate newlines.`
			`s = re.sub(' +', ' ', s)`
			`s = re.sub('\t', '', s)`
			`s = re.sub('\n+', '\n', s)`
			`s = re.sub('\n ', '\n', s)`
			`s = re.sub(' \n', '\n', s)`

			`# Remove non-alphanumeric chars`
Testing how much case impacts performance Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-24 21:25:52 +00:00			`s = re.sub('[^0-9A-Za-z,\.\(\) \n]', '', s)#.lower()`
Clean up cleaning up documents prior to compressing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 21:26:28 +00:00
			`return s`

Change preset and behavior for very short samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-31 19:15:13 +00:00			`def to_english(s : str) -> str:`
			`'''`
			`Remove non-English (or names) from an input`
			`'''`

			`return s`

Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`# The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`PRELUDE_FILE : str = 'ai-generated.txt'`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00			`with open(PRELUDE_FILE, 'r') as fp:`
Clean up cleaning up documents prior to compressing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 21:26:28 +00:00			`PRELUDE_STR = clean_text(fp.read())`
Add GPT-3 samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 15:52:59 +00:00
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`class LzmaLlmDetector:`
			`'''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''`
Improve performance by farming out chunks to other processes with multiprocessing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 14:20:29 +00:00			`def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`'''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''`
Change preset and behavior for very short samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-31 19:15:13 +00:00			`self.PRESET : int = 1`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00			`self.comp = lzma.LZMACompressor(preset=self.PRESET)`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`self.c_buf : List[bytes] = []`
			`self.in_bytes : int = 0`
Fix bug in finalize, and add one more generated abstract to ai-generated from samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 15:13:04 +00:00			`self.prelude_ratio : float = 0.0`
			`if prelude_ratio != None:`
			`self.prelude_ratio = prelude_ratio`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`self.FUZZINESS_THRESHOLD = fuzziness_digits`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00			`self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00
			`if prelude_file != None:`
			`# Read it once to get the default compression ratio for the prelude`
			`with open(prelude_file, 'r') as fp:`
			`self._compress_str(fp.read())`
			`self.prelude_ratio = self._finalize()`
			`# Redo this to prime the compressor`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00			`self.comp = lzma.LZMACompressor(preset=self.PRESET)`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`with open(prelude_file, 'r') as fp:`
			`self._compress_str(fp.read())`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00
			`if prelude_str != None:`
Improve performance by farming out chunks to other processes with multiprocessing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 14:20:29 +00:00			`if self.prelude_ratio == 0.0:`
			`self._compress_str(prelude_str)`
			`self.prelude_ratio = self._finalize()`
			`self.comp = lzma.LZMACompressor(preset=self.PRESET)`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00			`self._compress_str(prelude_str)`
Fix bug in finalize, and add one more generated abstract to ai-generated from samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 15:13:04 +00:00
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`def _compress_str(self, s : str) -> None:`
			`'''`
			`Internal helper function to compress a string`
			`'''`
Fix bug in finalize, and add one more generated abstract to ai-generated from samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 15:13:04 +00:00			`strb : bytes = s.encode('ascii', errors='ignore')`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`self.c_buf.append(self.comp.compress(strb))`
			`self.in_bytes += len(strb)`

			`def _finalize(self) -> float:`
			`'''`
			`Finalizes an LZMA compression cycle and returns the percentage compression ratio`

			`post: _ >= 0`
			`'''`
			`self.c_buf.append(self.comp.flush())`
			`compressed_size : int = len(b''.join(self.c_buf))`
			`if self.in_bytes == 0:`
			`return 0.0`
Fix bug in finalize, and add one more generated abstract to ai-generated from samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 15:13:04 +00:00			`score = compressed_size / self.in_bytes`
			`self.in_bytes = 0`
			`self.c_buf = []`
			`return score`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00
			`def get_compression_ratio(self, s : str) -> Tuple[float, float]:`
			`'''`
			`Returns a tuple of floats with the compression ratio of the prelude (0 if no prelude) and passed string`
			`'''`
			`self._compress_str(s)`
			`return (self.prelude_ratio, self._finalize())`

			`def score_text(self, sample : str) -> Optional[Tuple[str, float]]:`
			`'''`
			`Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated`
			`by either an AI or human. Returns None if it cannot make a determination`
			`'''`
			`if self.prelude_ratio == 0.0:`
			`return None`
			`(prelude_score, sample_score) = self.get_compression_ratio(sample)`
Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`#print(str((prelude_score, sample_score)))`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00			`delta = prelude_score - sample_score`
			`determination = 'AI'`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00			`if delta < 0:`
			`determination = 'Human'`

			`# If the sample doesn't 'move the needle', it's very close`
Change preset and behavior for very short samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-31 19:15:13 +00:00			`# if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) >= self.SHORT_SAMPLE_THRESHOLD:`
			`# #print('Sample len to default to AI: ' + str(len(sample)))`
			`# determination = 'AI'`
			`# if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) < self.SHORT_SAMPLE_THRESHOLD:`
			`# #print('Sample len to default to Human: ' + str(len(sample)))`
			`# determination = 'Human'`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`#if abs(delta * 100) < .1 and determination == 'AI':`
			`# print("Very low-confidence determination of: " + determination)`
Change preset and behavior for very short samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-31 19:15:13 +00:00			`return (determination, abs(delta * 100))`
Initial commit of prototype LLM detector Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 16:56:02 +00:00
Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, float]]:`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''`
Added testing framework to determine what samples fail Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:46:59 +00:00			`with open(filename, 'r') as fp:`
Improve command-line calling and test suite Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 20:32:43 +00:00			`l = LzmaLlmDetector(PRELUDE_FILE, fuzziness)`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`txt = fp.read()`
			`#print('Calculating score for input of length ' + str(len(txt)))`
			`return l.score_text(txt)`

Improve performance by farming out chunks to other processes with multiprocessing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 14:20:29 +00:00			`def _score_chunk(c : str, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Tuple[str, float]:`
			`l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)`
			`return l.score_text(c)`

Change preset and behavior for very short samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-31 19:15:13 +00:00			`def run_on_file_chunked(filename : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Tuple[str, float]]:`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`'''`
			`Given a filename (and an optional chunk size and number of decimal places to round to) returns the score for the contents of that file.`
			`This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input`
			`being skewed because its compression ratio starts to overwhelm the prelude file.`
			`'''`
			`with open(filename, 'r') as fp:`
			`contents = fp.read()`
Add GPT-3 samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 15:52:59 +00:00			`return run_on_text_chunked(contents, chunk_size, fuzziness, prelude_ratio)`

Change preset and behavior for very short samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-31 19:15:13 +00:00			`def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Tuple[str, float]]:`
Add GPT-3 samples Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 15:52:59 +00:00			`'''`
			`Given a string (and an optional chunk size and number of decimal places to round to) returns the score for the passed string.`
			`This function chunks the input into at most chunk_size parts to score separately, then returns an average. This prevents a very large input`
			`being skewed because its compression ratio starts to overwhelm the prelude file.`
			`'''`
Clean up cleaning up documents prior to compressing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-18 21:26:28 +00:00			`contents = clean_text(s)`
Strip whitespace, add to ai-generated, and improve tuning parameters Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-11 20:26:59 +00:00
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`start = 0`
			`end = 0`
			`chunks = []`
			`while start + chunk_size < len(contents) and end != -1:`
			`end = contents.rfind(' ', start, start + chunk_size + 1)`
			`chunks.append(contents[start:end])`
			`start = end + 1`
			`chunks.append(contents[start:])`
			`scores = []`
Improve performance by farming out chunks to other processes with multiprocessing Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-12 14:20:29 +00:00			`if len(chunks) > 2:`
			`with Pool(cpu_count()) as pool:`
			`for r in pool.starmap(_score_chunk, zip(chunks, itertools.repeat(fuzziness), itertools.repeat(prelude_ratio))):`
			`scores.append(r)`
			`else:`
			`for c in chunks:`
			`scores.append(_score_chunk(c, fuzziness=fuzziness, prelude_ratio=prelude_ratio))`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`ssum : float = 0.0`
Add a weighted averaging to the chunking Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-24 20:56:54 +00:00			`for i, s in enumerate(scores):`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`if s[0] == 'AI':`
Add a weighted averaging to the chunking Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-24 20:56:54 +00:00			`ssum -= s[1] * (len(chunks[i]) / len(contents))`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`else:`
Add a weighted averaging to the chunking Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-24 20:56:54 +00:00			`ssum += s[1] * (len(chunks[i]) / len(contents))`
			`sa : float = ssum# / len(scores)`
Added support for chunking the input to prevent a very large sample from overwhelming the compression impact of the prelude file Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 22:42:38 +00:00			`if sa < 0:`
			`return ('AI', abs(sa))`
			`else:`
			`return ('Human', abs(sa))`

Added testing framework to determine what samples fail Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:46:59 +00:00
Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser()`
Added stdin reading for lzma_detect Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-06-06 16:34:39 +00:00			`group = parser.add_mutually_exclusive_group()`
			`group.add_argument("-s", help='Read from stdin until EOF is reached instead of from a file', required=False, action='store_true')`
			`group.add_argument("sample_files", nargs='*', help='Text file(s) containing the sample to classify', default="")`
Added command-line parser/runner and samples to test with Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-05-10 17:36:43 +00:00			`args = parser.parse_args()`
Added stdin reading for lzma_detect Signed-off-by: Jacob Torrey <jacob@thinkst.com> 2023-06-06 16:34:39 +00:00			`if args.s:`
			`print(str(run_on_text_chunked(''.join(list(sys.stdin)))))`
			`else:`
			`for f in args.sample_files:`
			`print(f)`
			`if os.path.isfile(f):`
			`print(str(run_on_file_chunked(f)))`