Added the Originality.AI benchmark to samples and test_zippy_detect.py

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
2025-02-27 13:16:35 -07:00 · 2025-02-27 13:16:35 -07:00 · 0ecd771a4d
commit 0ecd771a4d
--- a/ai_detect_roc.png
+++ b/ai_detect_roc.png
--- a/samples/originality-2000_samples_benchmark.csv
+++ b/samples/originality-2000_samples_benchmark.csv
--- a/test_results/zippy-lzma-report.xml
+++ b/test_results/zippy-lzma-report.xml
--- a/test_zippy_detect.py
+++ b/test_zippy_detect.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-import pytest, os, jsonlines, csv
+import pytest, os, jsonlines, csv, random
 from warnings import warn
 from zippy.zippy import Zippy, EnsembledZippy, PRELUDE_STR, LzmaLlmDetector, BrotliLlmDetector, ZlibLlmDetector, CompressionEngine
 import zippy.zippy
@@ -14,6 +14,8 @@ NUM_JSONL_SAMPLES = 500
 ai_files = os.listdir(AI_SAMPLE_DIR)
 human_files = os.listdir(HUMAN_SAMPLE_DIR)

+random.seed('ZIPPY TESTING SEED') # Allow for randomizing the test sets, but with a fixed seed for repeatability
+
 CONFIDENCE_THRESHOLD : float = 0.00 # What confidence to treat as error vs warning

 # Bool on whether to ensemble the models or run a single model
@@ -204,3 +206,24 @@ def test_gptzero_eval_dataset_ai(i, record_property):
    (classification, score) = zippy.run_on_text_chunked(i.get('Document', ''), prelude_ratio=PRELUDE_RATIO)
    record_property("score", str(score))
    assert classification == i.get('Label'), GPTZERO_EVAL_FILE + ':' + str(i.get('Index')) + ' was misclassified with confidence ' + str(round(score, 8))
+
+ORIGINALITY_EVAL_FILE = 'samples/originality-2000_samples_benchmark.csv'
+originality_samples = []
+with open(ORIGINALITY_EVAL_FILE) as fp:
+    csvr = csv.DictReader(fp)
+
+    for obj in csvr:
+        if len(obj.get('text', '')) >= MIN_LEN:
+            originality_samples.append(obj)
+
+@pytest.mark.parametrize('i', random.sample(list(filter(lambda x: x.get('label') == 'human-written', originality_samples)), NUM_JSONL_SAMPLES))
+def test_originality_eval_dataset_human(i, record_property):
+    (classification, score) = zippy.run_on_text_chunked(i.get('text', ''), prelude_ratio=PRELUDE_RATIO)
+    record_property("score", str(score))
+    assert classification == 'Human', ORIGINALITY_EVAL_FILE + ':' + str(i.get('dataset')) + ' was misclassified with confidence ' + str(round(score, 8))
+
+@pytest.mark.parametrize('i', random.sample(list(filter(lambda x: x.get('label') == 'ai-generated' and x.get('dataset') != 'paraphrase', originality_samples)), NUM_JSONL_SAMPLES))
+def test_originality_eval_dataset_ai(i, record_property):
+    (classification, score) = zippy.run_on_text_chunked(i.get('text', ''), prelude_ratio=PRELUDE_RATIO)
+    record_property("score", str(score))
+    assert classification == 'AI', ORIGINALITY_EVAL_FILE + ':' + str(i.get('dataset')) + ' was misclassified with confidence ' + str(round(score, 8))