Completed evaluation of contentatscale.ai and added zlib support to both the Python and Nim/JS implementations

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
Jacob Torrey 2023-09-26 07:51:41 -06:00
parent 81bdb8e5dd
commit 1e3ae4e9aa
7 changed files with 1223 additions and 304 deletions

Binary file not shown.

Before: 78 KiB  |  After: 86 KiB

File diff suppressed because one or more lines are too long

View file

@@ -203,5 +203,3 @@ when defined(js) and isMainModule:
if opacity < 0.0:
opacity = 0.0
return opacity
#window.onload = on_load

View file

@@ -6,7 +6,7 @@ from sklearn.metrics import roc_curve, auc
import re
from junitparser import JUnitXml
MODELS = ['zippy', 'roberta', 'gptzero', 'crossplag', 'contentatscale']
MODELS = ['zippy-lzma', 'zippy-zlib', 'roberta', 'gptzero', 'crossplag', 'contentatscale']
SKIPCASES = ['gpt2', 'gpt3']
MAX_PER_CASE = 500
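
The hunk above splits the single zippy entry into zippy-lzma and zippy-zlib, so each compression engine is scored as its own model. A minimal sketch of how the recorded per-sample scores could be reduced to one AUC figure per model, using the roc_curve/auc imports already at the top of this file (the signed-score mapping and the label convention are assumptions for illustration, not the repo's actual evaluation code):

```python
from sklearn.metrics import roc_curve, auc

def auc_for_model(results: list[tuple[str, float]], labels: list[int]) -> float:
    # results: (classification, confidence) pairs as recorded via record_property
    # labels:  1 = AI-generated sample, 0 = human-generated (assumed convention)
    # Fold each pair into one signed score: positive means "looks AI-generated".
    signed = [conf if label == 'AI' else -conf for label, conf in results]
    fpr, tpr, _ = roc_curve(labels, signed)
    return auc(fpr, tpr)

# Called once per entry in MODELS, e.g. for 'zippy-lzma' and 'zippy-zlib'.
```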

View file

@@ -8,7 +8,7 @@ AI_SAMPLE_DIR = 'samples/llm-generated/'
HUMAN_SAMPLE_DIR = 'samples/human-generated/'
MIN_LEN = 150
NUM_JSONL_SAMPLES = 15#500
NUM_JSONL_SAMPLES = 500
ai_files = os.listdir(AI_SAMPLE_DIR)
human_files = os.listdir(HUMAN_SAMPLE_DIR)
@@ -68,37 +68,37 @@ def test_human_jsonl(i, record_property):
record_property("score", str(score))
assert classification == 'Human', HUMAN_JSONL_FILE + ':' + str(i.get('id')) + ' (len: ' + str(i.get('length', -1)) + ') is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
ai_samples = []
with jsonlines.open(AI_JSONL_FILE) as reader:
for obj in reader:
ai_samples.append(obj)
# AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
# ai_samples = []
# with jsonlines.open(AI_JSONL_FILE) as reader:
# for obj in reader:
# ai_samples.append(obj)
@pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
def test_llm_jsonl(i, record_property):
res = run_on_text_chunked(i.get('text', ''))
if res is None:
pytest.skip('Unable to classify')
(classification, score) = res
record_property("score", str(score))
assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt2_jsonl(i, record_property):
# res = run_on_text_chunked(i.get('text', ''))
# if res is None:
# pytest.skip('Unable to classify')
# (classification, score) = res
# record_property("score", str(score))
# assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
gpt3_samples = []
with jsonlines.open(GPT3_JSONL_FILE) as reader:
for o in reader:
for l in o.split('<|endoftext|>'):
if len(l) >= MIN_LEN:
gpt3_samples.append(l)
# GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
# gpt3_samples = []
# with jsonlines.open(GPT3_JSONL_FILE) as reader:
# for o in reader:
# for l in o.split('<|endoftext|>'):
# if len(l) >= MIN_LEN:
# gpt3_samples.append(l)
@pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
def test_gpt3_jsonl(i, record_property):
res = run_on_text_chunked(i)
if res is None:
pytest.skip('Unable to classify')
(classification, score) = res
record_property("score", str(score))
assert classification == 'AI', GPT3_JSONL_FILE + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt3_jsonl(i, record_property):
# res = run_on_text_chunked(i)
# if res is None:
# pytest.skip('Unable to classify')
# (classification, score) = res
# record_property("score", str(score))
# assert classification == 'AI', GPT3_JSONL_FILE[0:250] + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
NEWS_JSONL_FILE = 'samples/news.jsonl'
news_samples = []

View file

@@ -2,21 +2,24 @@
import pytest, os, jsonlines, csv
from warnings import warn
from zippy import run_on_file_chunked, run_on_text_chunked, PRELUDE_STR, LzmaLlmDetector
from zippy import run_on_file_chunked, run_on_text_chunked, PRELUDE_STR, LzmaLlmDetector, CompressionEngine, ZlibLlmDetector, ENGINE
import zippy
AI_SAMPLE_DIR = 'samples/llm-generated/'
HUMAN_SAMPLE_DIR = 'samples/human-generated/'
MIN_LEN = 50
MIN_LEN = 150
NUM_JSONL_SAMPLES = 500
ai_files = os.listdir(AI_SAMPLE_DIR)
human_files = os.listdir(HUMAN_SAMPLE_DIR)
FUZZINESS = 3
CONFIDENCE_THRESHOLD : float = 0.00 # What confidence to treat as error vs warning
PRELUDE_RATIO = LzmaLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
if ENGINE == CompressionEngine.LZMA:
PRELUDE_RATIO = LzmaLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
elif ENGINE == CompressionEngine.ZLIB:
PRELUDE_RATIO = ZlibLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
def test_training_file(record_property):
(classification, score) = run_on_file_chunked('ai-generated.txt')
@@ -25,7 +28,7 @@ def test_training_file(record_property):
@pytest.mark.parametrize('f', human_files)
def test_human_samples(f, record_property):
(classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
if score > CONFIDENCE_THRESHOLD:
assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
@@ -37,7 +40,7 @@ def test_human_samples(f, record_property):
@pytest.mark.parametrize('f', ai_files)
def test_llm_sample(f, record_property):
(classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
if score > CONFIDENCE_THRESHOLD:
assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -56,36 +59,36 @@ with jsonlines.open(HUMAN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', human_samples[0:NUM_JSONL_SAMPLES])
def test_human_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('text', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('text', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'Human', HUMAN_JSONL_FILE + ':' + str(i.get('id')) + ' (len: ' + str(i.get('length', -1)) + ') is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
ai_samples = []
with jsonlines.open(AI_JSONL_FILE) as reader:
for obj in reader:
if obj.get('length', 0) >= MIN_LEN:
ai_samples.append(obj)
# AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
# ai_samples = []
# with jsonlines.open(AI_JSONL_FILE) as reader:
# for obj in reader:
# if obj.get('length', 0) >= MIN_LEN:
# ai_samples.append(obj)
@pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
def test_gpt2_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('text', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt2_jsonl(i, record_property):
# (classification, score) = run_on_text_chunked(i.get('text', ''), prelude_ratio=PRELUDE_RATIO)
# record_property("score", str(score))
# assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
gpt3_samples = []
with jsonlines.open(GPT3_JSONL_FILE) as reader:
for o in reader:
for l in o.split('<|endoftext|>'):
if len(l) >= MIN_LEN:
gpt3_samples.append(l)
# GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
# gpt3_samples = []
# with jsonlines.open(GPT3_JSONL_FILE) as reader:
# for o in reader:
# for l in o.split('<|endoftext|>'):
# if len(l) >= MIN_LEN:
# gpt3_samples.append(l)
@pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
def test_gpt3_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', GPT3_JSONL_FILE + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt3_jsonl(i, record_property):
# (classification, score) = run_on_text_chunked(i, prelude_ratio=PRELUDE_RATIO)
# record_property("score", str(score))
# assert classification == 'AI', GPT3_JSONL_FILE + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
NEWS_JSONL_FILE = 'samples/news.jsonl'
news_samples = []
@@ -95,13 +98,13 @@ with jsonlines.open(NEWS_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', news_samples[0:NUM_JSONL_SAMPLES])
def test_humannews_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('human', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('human', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'Human', NEWS_JSONL_FILE + ' is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
@pytest.mark.parametrize('i', news_samples[0:NUM_JSONL_SAMPLES])
def test_chatgptnews_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('chatgpt', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('chatgpt', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', NEWS_JSONL_FILE + ' is an AI-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -114,7 +117,7 @@ with jsonlines.open(CHEAT_HUMAN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', ch_samples[0:NUM_JSONL_SAMPLES])
def test_cheat_human_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'Human', CHEAT_HUMAN_JSONL_FILE + ':' + str(i.get('id')) + ' [' + str(len(i.get('abstract', ''))) + '] (title: ' + i.get('title', "").replace('\n', ' ')[:15] + ') is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
@@ -127,7 +130,7 @@ with jsonlines.open(CHEAT_GEN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', cg_samples[0:NUM_JSONL_SAMPLES])
def test_cheat_generation_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', CHEAT_GEN_JSONL_FILE + ':' + str(i.get('id')) + ' (title: ' + i.get('title', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -140,7 +143,7 @@ with jsonlines.open(CHEAT_POLISH_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', cp_samples[0:NUM_JSONL_SAMPLES])
def test_cheat_polish_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', CHEAT_POLISH_JSONL_FILE + ':' + str(i.get('id')) + ' (title: ' + i.get('title', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -153,7 +156,7 @@ with jsonlines.open(CHEAT_VICUNAGEN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', vg_samples[0:NUM_JSONL_SAMPLES])
def test_vicuna_generation_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', CHEAT_VICUNAGEN_JSONL_FILE + ':' + str(i.get('id')) + ' (title: ' + i.get('title', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -167,12 +170,12 @@ with open(GPTZERO_EVAL_FILE) as fp:
@pytest.mark.parametrize('i', list(filter(lambda x: x.get('Label') == 'Human', ge_samples[0:NUM_JSONL_SAMPLES])))
def test_gptzero_eval_dataset_human(i, record_property):
(classification, score) = run_on_text_chunked(i.get('Document', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('Document', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == i.get('Label'), GPTZERO_EVAL_FILE + ':' + str(i.get('Index')) + ' was misclassified with confidence ' + str(round(score, 8))
@pytest.mark.parametrize('i', list(filter(lambda x: x.get('Label') == 'AI', ge_samples[0:NUM_JSONL_SAMPLES])))
def test_gptzero_eval_dataset_ai(i, record_property):
(classification, score) = run_on_text_chunked(i.get('Document', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('Document', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == i.get('Label'), GPTZERO_EVAL_FILE + ':' + str(i.get('Index')) + ' was misclassified with confidence ' + str(round(score, 8))
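
These tests attach each sample's score with record_property, which only reaches a report when pytest is told to write JUnit XML, the format the evaluation script parses with junitparser. A hypothetical driver, with both file names assumed for illustration:

```python
# Hypothetical driver, not part of the repo: run the suite once per engine
# and emit the JUnit XML consumed by the evaluation script.
import pytest

pytest.main(['test_zippy.py', '--junitxml=zippy-zlib.xml'])
```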

zippy.py
View file

@@ -5,12 +5,21 @@
# Author: Jacob Torrey <jacob@thinkst.com>
import lzma, argparse, os, itertools
from zlib import compressobj, Z_FINISH
import re, sys
from abc import ABC, abstractmethod
from enum import Enum
from typing import List, Optional, Tuple, TypeAlias
from multiprocessing import Pool, cpu_count
Score : TypeAlias = tuple[str, float]
class CompressionEngine(Enum):
LZMA = 1
ZLIB = 2
ENGINE : CompressionEngine = CompressionEngine.ZLIB
def clean_text(s : str) -> str:
'''
Removes formatting and other non-content data that may skew compression ratios (e.g., duplicate spaces)
@@ -32,9 +41,61 @@ PRELUDE_FILE : str = 'ai-generated.txt'
with open(PRELUDE_FILE, 'r') as fp:
PRELUDE_STR = clean_text(fp.read())
class LzmaLlmDetector:
class AIDetector(ABC):
'''
Base class for AI detection
'''
@abstractmethod
def score_text(self, sample : str) -> Optional[Score]:
pass
class ZlibLlmDetector(AIDetector):
'''Class providing functionality to attempt to detect LLM/generative AI generated text using the zlib compression algorithm'''
def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None):
self.PRESET = 9
self.WBITS = -15
self.prelude_ratio = 0.0
if prelude_ratio != None:
self.prelude_ratio = prelude_ratio
if prelude_file != None:
with open(prelude_file) as fp:
self.prelude_str = fp.read()
self.prelude_ratio = self._compress(self.prelude_str)
if prelude_str != None:
self.prelude_str = prelude_str
self.prelude_ratio = self._compress(self.prelude_str)
def _compress(self, s : str) -> float:
orig_len = len(s.encode())
c = compressobj(level=self.PRESET, wbits=self.WBITS, memLevel=9)
bytes = c.compress(s.encode())
bytes += c.flush(Z_FINISH)
c_len = len(bytes)
#c_len = len(compress(s.encode(), level=self.PRESET, wbits=self.WBITS))
return c_len / orig_len
def score_text(self, sample: str) -> Score | None:
'''
Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated
by either an AI or human. Returns None if it cannot make a determination
'''
if self.prelude_ratio == 0.0:
return None
sample_score = self._compress(self.prelude_str + sample)
#print(str((self.prelude_ratio, sample_score)))
delta = self.prelude_ratio - sample_score
determination = 'AI'
if delta < 0:
determination = 'Human'
return (determination, abs(delta * 100))
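
The new detector compresses the prelude once at construction, then compresses prelude plus sample: if the combined ratio comes in below the prelude's own ratio, the sample compressed "too well" against known AI text and is labelled AI, with the scaled ratio difference as confidence. A minimal usage sketch (the input string is invented):

```python
# Minimal usage sketch; the input text is invented for illustration.
detector = ZlibLlmDetector(prelude_str=PRELUDE_STR)
verdict = detector.score_text('Text whose provenance we want to check.')
if verdict is not None:
    label, confidence = verdict   # e.g. ('Human', 0.37)
    print(label, round(confidence, 4))
```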
class LzmaLlmDetector(AIDetector):
'''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''
def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:
def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:
'''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''
self.PRESET : int = 2
self.comp = lzma.LZMACompressor(preset=self.PRESET)
@@ -43,7 +104,6 @@ class LzmaLlmDetector:
self.prelude_ratio : float = 0.0
if prelude_ratio != None:
self.prelude_ratio = prelude_ratio
self.FUZZINESS_THRESHOLD = fuzziness_digits
self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"
if prelude_file != None:
@@ -102,39 +162,39 @@ class LzmaLlmDetector:
if self.prelude_ratio == 0.0:
return None
(prelude_score, sample_score) = self.get_compression_ratio(sample)
#print(str((prelude_score, sample_score)))
print(str((self.prelude_ratio, sample_score)))
delta = prelude_score - sample_score
determination = 'AI'
if delta < 0:
determination = 'Human'
# If the sample doesn't 'move the needle', it's very close
# if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) >= self.SHORT_SAMPLE_THRESHOLD:
# #print('Sample len to default to AI: ' + str(len(sample)))
# determination = 'AI'
# if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) < self.SHORT_SAMPLE_THRESHOLD:
# #print('Sample len to default to Human: ' + str(len(sample)))
# determination = 'Human'
#if abs(delta * 100) < .1 and determination == 'AI':
# print("Very low-confidence determination of: " + determination)
return (determination, abs(delta * 100))
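
Both engines now share this bare decision rule, with the fuzziness-based rounding heuristics left behind as the commented-out block above. A toy worked example with invented numbers:

```python
# Toy numbers, invented to illustrate the decision rule above.
prelude_ratio = 0.300  # compressed/original ratio of the prelude alone
sample_score = 0.295   # ratio once the sample is appended to the prelude
delta = prelude_ratio - sample_score   # +0.005: sample compressed "too well"
determination = 'Human' if delta < 0 else 'AI'
confidence = abs(delta * 100)          # 0.5
```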
def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Score]:
def run_on_file(filename : str) -> Optional[Score]:
'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''
with open(filename, 'r') as fp:
l = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
if ENGINE == CompressionEngine.LZMA:
l = LzmaLlmDetector(prelude_file=PRELUDE_FILE)
elif ENGINE == CompressionEngine.ZLIB:
l = ZlibLlmDetector(prelude_file=PRELUDE_FILE)
txt = fp.read()
#print('Calculating score for input of length ' + str(len(txt)))
return l.score_text(txt)
def _score_chunk(c : str, fuzziness : int = 3, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Score:
def _score_chunk(c : str, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Score:
if prelude_file != None:
l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_file=prelude_file)
if ENGINE == CompressionEngine.LZMA:
l = LzmaLlmDetector(prelude_file=prelude_file)
if ENGINE == CompressionEngine.ZLIB:
l = ZlibLlmDetector(prelude_file=prelude_file)
else:
l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)
if ENGINE == CompressionEngine.LZMA:
l = LzmaLlmDetector(prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)
if ENGINE == CompressionEngine.ZLIB:
l = ZlibLlmDetector(prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)
return l.score_text(c)
def run_on_file_chunked(filename : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Score]:
def run_on_file_chunked(filename : str, chunk_size : int = 1500, prelude_ratio : Optional[float] = None) -> Optional[Score]:
'''
Given a filename (and an optional chunk size and number of decimal places to round to) returns the score for the contents of that file.
This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -142,9 +202,9 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1500, fuzziness : int
'''
with open(filename, 'r') as fp:
contents = fp.read()
return run_on_text_chunked(contents, chunk_size, fuzziness=fuzziness, prelude_ratio=prelude_ratio)
return run_on_text_chunked(contents, chunk_size, prelude_ratio=prelude_ratio)
def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Optional[Score]:
def run_on_text_chunked(s : str, chunk_size : int = 1500, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Optional[Score]:
'''
Given a string (and an optional chunk size and number of decimal places to round to) returns the score for the passed string.
This function chunks the input into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -163,11 +223,11 @@ def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, p
scores = []
if len(chunks) > 2:
with Pool(cpu_count()) as pool:
for r in pool.starmap(_score_chunk, zip(chunks, itertools.repeat(fuzziness), itertools.repeat(prelude_file), itertools.repeat(prelude_ratio))):
for r in pool.starmap(_score_chunk, zip(chunks, itertools.repeat(prelude_file), itertools.repeat(prelude_ratio))):
scores.append(r)
else:
for c in chunks:
scores.append(_score_chunk(c, fuzziness=fuzziness, prelude_file=prelude_file, prelude_ratio=prelude_ratio))
scores.append(_score_chunk(c, prelude_file=prelude_file, prelude_ratio=prelude_ratio))
ssum : float = 0.0
for i, s in enumerate(scores):
if s[0] == 'AI':
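
With fuzziness removed, the worker pool in the hunk above zips only the prelude arguments alongside the chunks. A hedged sketch of calling the chunked scorer after this change (the input string is invented; with no prelude arguments it falls back to the module-level ENGINE and PRELUDE_STR):

```python
# Hedged sketch; the input string is invented.
text = 'A few paragraphs of text to classify... ' * 50
result = run_on_text_chunked(text)   # chunk_size defaults to 1500
if result is not None:
    print(result)                    # e.g. ('AI', 1.2345)
```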
@@ -183,10 +243,16 @@ def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, p
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-e", choices=['zlib', 'lzma'], help='Which compression engine to use: lzma or zlib', default='lzma', required=False)
group = parser.add_mutually_exclusive_group()
group.add_argument("-s", help='Read from stdin until EOF is reached instead of from a file', required=False, action='store_true')
group.add_argument("sample_files", nargs='*', help='Text file(s) containing the sample to classify', default="")
args = parser.parse_args()
if args.e:
if args.e == 'lzma':
ENGINE = CompressionEngine.LZMA
elif args.e == 'zlib':
ENGINE = CompressionEngine.ZLIB
if args.s:
print(str(run_on_text_chunked(''.join(list(sys.stdin)))))
elif len(args.sample_files) == 0:
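
The new -e flag selects the compression engine at runtime, overriding the ZLIB default set near the top of the file, and -s reads the sample from stdin instead of a file. Hedged invocations (the sample filename is invented):

```
python3 zippy.py -e lzma essay.txt
cat essay.txt | python3 zippy.py -e zlib -s
```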