Completed evaluation of contentatscale.ai and added zlib support to both the Python and Nim/JS implementations

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
Jacob Torrey 2023-09-26 07:51:41 -06:00
parent 81bdb8e5dd
commit 1e3ae4e9aa
7 changed files with 1223 additions and 304 deletions

Binary file not shown.

Before: 78 KiB  |  After: 86 KiB

File diff suppressed because one or more lines are too long

View file

@@ -203,5 +203,3 @@ when defined(js) and isMainModule:
if opacity < 0.0:
opacity = 0.0
return opacity
#window.onload = on_load

View file

@@ -6,7 +6,7 @@ from sklearn.metrics import roc_curve, auc
import re
from junitparser import JUnitXml
MODELS = ['zippy', 'roberta', 'gptzero', 'crossplag', 'contentatscale']
MODELS = ['zippy-lzma', 'zippy-zlib', 'roberta', 'gptzero', 'crossplag', 'contentatscale']
SKIPCASES = ['gpt2', 'gpt3']
MAX_PER_CASE = 500
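
The hunk above splits the single zippy entry into zippy-lzma and zippy-zlib, so each compression engine is scored as its own model. A minimal sketch of how the recorded per-sample scores could be reduced to one AUC figure per model, using the roc_curve/auc imports already at the top of this file (the signed-score mapping and the label convention are assumptions for illustration, not the repo's actual evaluation code):

```python
from sklearn.metrics import roc_curve, auc

def auc_for_model(results: list[tuple[str, float]], labels: list[int]) -> float:
    # results: (classification, confidence) pairs as recorded via record_property
    # labels:  1 = AI-generated sample, 0 = human-generated (assumed convention)
    # Fold each pair into one signed score: positive means "looks AI-generated".
    signed = [conf if label == 'AI' else -conf for label, conf in results]
    fpr, tpr, _ = roc_curve(labels, signed)
    return auc(fpr, tpr)

# Called once per entry in MODELS, e.g. for 'zippy-lzma' and 'zippy-zlib'.
```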

View file

@@ -8,7 +8,7 @@ AI_SAMPLE_DIR = 'samples/llm-generated/'
HUMAN_SAMPLE_DIR = 'samples/human-generated/'
MIN_LEN = 150
NUM_JSONL_SAMPLES = 15#500
NUM_JSONL_SAMPLES = 500
ai_files = os.listdir(AI_SAMPLE_DIR)
human_files = os.listdir(HUMAN_SAMPLE_DIR)
@@ -68,37 +68,37 @@ def test_human_jsonl(i, record_property):
record_property("score", str(score))
assert classification == 'Human', HUMAN_JSONL_FILE + ':' + str(i.get('id')) + ' (len: ' + str(i.get('length', -1)) + ') is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
ai_samples = []
with jsonlines.open(AI_JSONL_FILE) as reader:
for obj in reader:
ai_samples.append(obj)
# AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
# ai_samples = []
# with jsonlines.open(AI_JSONL_FILE) as reader:
# for obj in reader:
# ai_samples.append(obj)
@pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
def test_llm_jsonl(i, record_property):
res = run_on_text_chunked(i.get('text', ''))
if res is None:
pytest.skip('Unable to classify')
(classification, score) = res
record_property("score", str(score))
assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt2_jsonl(i, record_property):
# res = run_on_text_chunked(i.get('text', ''))
# if res is None:
# pytest.skip('Unable to classify')
# (classification, score) = res
# record_property("score", str(score))
# assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
gpt3_samples = []
with jsonlines.open(GPT3_JSONL_FILE) as reader:
for o in reader:
for l in o.split('<|endoftext|>'):
if len(l) >= MIN_LEN:
gpt3_samples.append(l)
# GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
# gpt3_samples = []
# with jsonlines.open(GPT3_JSONL_FILE) as reader:
# for o in reader:
# for l in o.split('<|endoftext|>'):
# if len(l) >= MIN_LEN:
# gpt3_samples.append(l)
@pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
def test_gpt3_jsonl(i, record_property):
res = run_on_text_chunked(i)
if res is None:
pytest.skip('Unable to classify')
(classification, score) = res
record_property("score", str(score))
assert classification == 'AI', GPT3_JSONL_FILE + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt3_jsonl(i, record_property):
# res = run_on_text_chunked(i)
# if res is None:
# pytest.skip('Unable to classify')
# (classification, score) = res
# record_property("score", str(score))
# assert classification == 'AI', GPT3_JSONL_FILE[0:250] + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
NEWS_JSONL_FILE = 'samples/news.jsonl'
news_samples = []

View file

@@ -2,21 +2,24 @@
import pytest, os, jsonlines, csv
from warnings import warn
from zippy import run_on_file_chunked, run_on_text_chunked, PRELUDE_STR, LzmaLlmDetector
from zippy import run_on_file_chunked, run_on_text_chunked, PRELUDE_STR, LzmaLlmDetector, CompressionEngine, ZlibLlmDetector, ENGINE
import zippy
AI_SAMPLE_DIR = 'samples/llm-generated/'
HUMAN_SAMPLE_DIR = 'samples/human-generated/'
MIN_LEN = 50
MIN_LEN = 150
NUM_JSONL_SAMPLES = 500
ai_files = os.listdir(AI_SAMPLE_DIR)
human_files = os.listdir(HUMAN_SAMPLE_DIR)
FUZZINESS = 3
CONFIDENCE_THRESHOLD : float = 0.00 # What confidence to treat as error vs warning
PRELUDE_RATIO = LzmaLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
if ENGINE == CompressionEngine.LZMA:
PRELUDE_RATIO = LzmaLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
elif ENGINE == CompressionEngine.ZLIB:
PRELUDE_RATIO = ZlibLlmDetector(prelude_str=PRELUDE_STR).prelude_ratio
def test_training_file(record_property):
(classification, score) = run_on_file_chunked('ai-generated.txt')
@@ -25,7 +28,7 @@ def test_training_file(record_property):
@pytest.mark.parametrize('f', human_files)
def test_human_samples(f, record_property):
(classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_file_chunked(HUMAN_SAMPLE_DIR + f, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
if score > CONFIDENCE_THRESHOLD:
assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
@@ -37,7 +40,7 @@ def test_human_samples(f, record_property):
@pytest.mark.parametrize('f', ai_files)
def test_llm_sample(f, record_property):
(classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_file_chunked(AI_SAMPLE_DIR + f, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
if score > CONFIDENCE_THRESHOLD:
assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -56,36 +59,36 @@ with jsonlines.open(HUMAN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', human_samples[0:NUM_JSONL_SAMPLES])
def test_human_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('text', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('text', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'Human', HUMAN_JSONL_FILE + ':' + str(i.get('id')) + ' (len: ' + str(i.get('length', -1)) + ') is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
ai_samples = []
with jsonlines.open(AI_JSONL_FILE) as reader:
for obj in reader:
if obj.get('length', 0) >= MIN_LEN:
ai_samples.append(obj)
# AI_JSONL_FILE = 'samples/xl-1542M.test.jsonl'
# ai_samples = []
# with jsonlines.open(AI_JSONL_FILE) as reader:
# for obj in reader:
# if obj.get('length', 0) >= MIN_LEN:
# ai_samples.append(obj)
@pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
def test_gpt2_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('text', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', ai_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt2_jsonl(i, record_property):
# (classification, score) = run_on_text_chunked(i.get('text', ''), prelude_ratio=PRELUDE_RATIO)
# record_property("score", str(score))
# assert classification == 'AI', AI_JSONL_FILE + ':' + str(i.get('id')) + ' (text: ' + i.get('text', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
gpt3_samples = []
with jsonlines.open(GPT3_JSONL_FILE) as reader:
for o in reader:
for l in o.split('<|endoftext|>'):
if len(l) >= MIN_LEN:
gpt3_samples.append(l)
# GPT3_JSONL_FILE = 'samples/GPT-3-175b_samples.jsonl'
# gpt3_samples = []
# with jsonlines.open(GPT3_JSONL_FILE) as reader:
# for o in reader:
# for l in o.split('<|endoftext|>'):
# if len(l) >= MIN_LEN:
# gpt3_samples.append(l)
@pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
def test_gpt3_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i, fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', GPT3_JSONL_FILE + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
# @pytest.mark.parametrize('i', gpt3_samples[0:NUM_JSONL_SAMPLES])
# def test_gpt3_jsonl(i, record_property):
# (classification, score) = run_on_text_chunked(i, prelude_ratio=PRELUDE_RATIO)
# record_property("score", str(score))
# assert classification == 'AI', GPT3_JSONL_FILE + ' is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
NEWS_JSONL_FILE = 'samples/news.jsonl'
news_samples = []
@@ -95,13 +98,13 @@ with jsonlines.open(NEWS_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', news_samples[0:NUM_JSONL_SAMPLES])
def test_humannews_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('human', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('human', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'Human', NEWS_JSONL_FILE + ' is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
@pytest.mark.parametrize('i', news_samples[0:NUM_JSONL_SAMPLES])
def test_chatgptnews_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('chatgpt', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('chatgpt', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', NEWS_JSONL_FILE + ' is an AI-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -114,7 +117,7 @@ with jsonlines.open(CHEAT_HUMAN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', ch_samples[0:NUM_JSONL_SAMPLES])
def test_cheat_human_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'Human', CHEAT_HUMAN_JSONL_FILE + ':' + str(i.get('id')) + ' [' + str(len(i.get('abstract', ''))) + '] (title: ' + i.get('title', "").replace('\n', ' ')[:15] + ') is a human-generated sample, misclassified as AI-generated with confidence ' + str(round(score, 8))
@@ -127,7 +130,7 @@ with jsonlines.open(CHEAT_GEN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', cg_samples[0:NUM_JSONL_SAMPLES])
def test_cheat_generation_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', CHEAT_GEN_JSONL_FILE + ':' + str(i.get('id')) + ' (title: ' + i.get('title', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -140,7 +143,7 @@ with jsonlines.open(CHEAT_POLISH_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', cp_samples[0:NUM_JSONL_SAMPLES])
def test_cheat_polish_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', CHEAT_POLISH_JSONL_FILE + ':' + str(i.get('id')) + ' (title: ' + i.get('title', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -153,7 +156,7 @@ with jsonlines.open(CHEAT_VICUNAGEN_JSONL_FILE) as reader:
@pytest.mark.parametrize('i', vg_samples[0:NUM_JSONL_SAMPLES])
def test_vicuna_generation_jsonl(i, record_property):
(classification, score) = run_on_text_chunked(i.get('abstract', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('abstract', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == 'AI', CHEAT_VICUNAGEN_JSONL_FILE + ':' + str(i.get('id')) + ' (title: ' + i.get('title', "").replace('\n', ' ')[:50] + ') is an LLM-generated sample, misclassified as human-generated with confidence ' + str(round(score, 8))
@@ -167,12 +170,12 @@ with open(GPTZERO_EVAL_FILE) as fp:
@pytest.mark.parametrize('i', list(filter(lambda x: x.get('Label') == 'Human', ge_samples[0:NUM_JSONL_SAMPLES])))
def test_gptzero_eval_dataset_human(i, record_property):
(classification, score) = run_on_text_chunked(i.get('Document', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('Document', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == i.get('Label'), GPTZERO_EVAL_FILE + ':' + str(i.get('Index')) + ' was misclassified with confidence ' + str(round(score, 8))
@pytest.mark.parametrize('i', list(filter(lambda x: x.get('Label') == 'AI', ge_samples[0:NUM_JSONL_SAMPLES])))
def test_gptzero_eval_dataset_ai(i, record_property):
(classification, score) = run_on_text_chunked(i.get('Document', ''), fuzziness=FUZZINESS, prelude_ratio=PRELUDE_RATIO)
(classification, score) = run_on_text_chunked(i.get('Document', ''), prelude_ratio=PRELUDE_RATIO)
record_property("score", str(score))
assert classification == i.get('Label'), GPTZERO_EVAL_FILE + ':' + str(i.get('Index')) + ' was misclassified with confidence ' + str(round(score, 8))
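
These tests attach each sample's score with record_property, which only reaches a report when pytest is told to write JUnit XML, the format the evaluation script parses with junitparser. A hypothetical driver, with both file names assumed for illustration:

```python
# Hypothetical driver, not part of the repo: run the suite once per engine
# and emit the JUnit XML consumed by the evaluation script.
import pytest

pytest.main(['test_zippy.py', '--junitxml=zippy-zlib.xml'])
```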

zippy.py
View file

@@ -5,12 +5,21 @@
# Author: Jacob Torrey <jacob@thinkst.com>
import lzma, argparse, os, itertools
from zlib import compressobj, Z_FINISH
import re, sys
from abc import ABC, abstractmethod
from enum import Enum
from typing import List, Optional, Tuple, TypeAlias
from multiprocessing import Pool, cpu_count
Score : TypeAlias = tuple[str, float]
class CompressionEngine(Enum):
LZMA = 1
ZLIB = 2
ENGINE : CompressionEngine = CompressionEngine.ZLIB
def clean_text(s : str) -> str:
'''
Removes formatting and other non-content data that may skew compression ratios (e.g., duplicate spaces)
@@ -32,9 +41,61 @@ PRELUDE_FILE : str = 'ai-generated.txt'
with open(PRELUDE_FILE, 'r') as fp:
PRELUDE_STR = clean_text(fp.read())
class LzmaLlmDetector:
class AIDetector(ABC):
'''
Base class for AI detection
'''
@abstractmethod
def score_text(self, sample : str) -> Optional[Score]:
pass
class ZlibLlmDetector(AIDetector):
'''Class providing functionality to attempt to detect LLM/generative AI generated text using the zlib compression algorithm'''
def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None):
self.PRESET = 9
self.WBITS = -15
self.prelude_ratio = 0.0
if prelude_ratio != None:
self.prelude_ratio = prelude_ratio
if prelude_file != None:
with open(prelude_file) as fp:
self.prelude_str = fp.read()
self.prelude_ratio = self._compress(self.prelude_str)
if prelude_str != None:
self.prelude_str = prelude_str
self.prelude_ratio = self._compress(self.prelude_str)
def _compress(self, s : str) -> float:
orig_len = len(s.encode())
c = compressobj(level=self.PRESET, wbits=self.WBITS, memLevel=9)
bytes = c.compress(s.encode())
bytes += c.flush(Z_FINISH)
c_len = len(bytes)
#c_len = len(compress(s.encode(), level=self.PRESET, wbits=self.WBITS))
return c_len / orig_len
def score_text(self, sample: str) -> Score | None:
'''
Returns a tuple of a string (AI or Human) and a float confidence (higher is more confident) that the sample was generated
by either an AI or human. Returns None if it cannot make a determination
'''
if self.prelude_ratio == 0.0:
return None
sample_score = self._compress(self.prelude_str + sample)
#print(str((self.prelude_ratio, sample_score)))
delta = self.prelude_ratio - sample_score
determination = 'AI'
if delta < 0:
determination = 'Human'
return (determination, abs(delta * 100))
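
The new detector compresses the prelude once at construction, then compresses prelude plus sample: if the combined ratio comes in below the prelude's own ratio, the sample compressed "too well" against known AI text and is labelled AI, with the scaled ratio difference as confidence. A minimal usage sketch (the input string is invented):

```python
# Minimal usage sketch; the input text is invented for illustration.
detector = ZlibLlmDetector(prelude_str=PRELUDE_STR)
verdict = detector.score_text('Text whose provenance we want to check.')
if verdict is not None:
    label, confidence = verdict   # e.g. ('Human', 0.37)
    print(label, round(confidence, 4))
```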
class LzmaLlmDetector(AIDetector):
'''Class providing functionality to attempt to detect LLM/generative AI generated text using the LZMA compression algorithm'''
def __init__(self, prelude_file : Optional[str] = None, fuzziness_digits : int = 3, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:
def __init__(self, prelude_file : Optional[str] = None, prelude_str : Optional[str] = None, prelude_ratio : Optional[float] = None) -> None:
'''Initializes a compression with the passed prelude file, and optionally the number of digits to round to compare prelude vs. sample compression'''
self.PRESET : int = 2
self.comp = lzma.LZMACompressor(preset=self.PRESET)
@@ -43,7 +104,6 @@ class LzmaLlmDetector:
self.prelude_ratio : float = 0.0
if prelude_ratio != None:
self.prelude_ratio = prelude_ratio
self.FUZZINESS_THRESHOLD = fuzziness_digits
self.SHORT_SAMPLE_THRESHOLD : int = 350 # What sample length is considered "short"
if prelude_file != None:
@@ -102,39 +162,39 @@ class LzmaLlmDetector:
if self.prelude_ratio == 0.0:
return None
(prelude_score, sample_score) = self.get_compression_ratio(sample)
#print(str((prelude_score, sample_score)))
print(str((self.prelude_ratio, sample_score)))
delta = prelude_score - sample_score
determination = 'AI'
if delta < 0:
determination = 'Human'
# If the sample doesn't 'move the needle', it's very close
# if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) >= self.SHORT_SAMPLE_THRESHOLD:
# #print('Sample len to default to AI: ' + str(len(sample)))
# determination = 'AI'
# if round(delta, self.FUZZINESS_THRESHOLD) == 0 and len(sample) < self.SHORT_SAMPLE_THRESHOLD:
# #print('Sample len to default to Human: ' + str(len(sample)))
# determination = 'Human'
#if abs(delta * 100) < .1 and determination == 'AI':
# print("Very low-confidence determination of: " + determination)
return (determination, abs(delta * 100))
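
Both engines now share this bare decision rule, with the fuzziness-based rounding heuristics left behind as the commented-out block above. A toy worked example with invented numbers:

```python
# Toy numbers, invented to illustrate the decision rule above.
prelude_ratio = 0.300  # compressed/original ratio of the prelude alone
sample_score = 0.295   # ratio once the sample is appended to the prelude
delta = prelude_ratio - sample_score   # +0.005: sample compressed "too well"
determination = 'Human' if delta < 0 else 'AI'
confidence = abs(delta * 100)          # 0.5
```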
def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Score]:
def run_on_file(filename : str) -> Optional[Score]:
'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''
with open(filename, 'r') as fp:
l = LzmaLlmDetector(PRELUDE_FILE, fuzziness)
if ENGINE == CompressionEngine.LZMA:
l = LzmaLlmDetector(prelude_file=PRELUDE_FILE)
elif ENGINE == CompressionEngine.ZLIB:
l = ZlibLlmDetector(prelude_file=PRELUDE_FILE)
txt = fp.read()
#print('Calculating score for input of length ' + str(len(txt)))
return l.score_text(txt)
def _score_chunk(c : str, fuzziness : int = 3, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Score:
def _score_chunk(c : str, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Score:
if prelude_file != None:
l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_file=prelude_file)
if ENGINE == CompressionEngine.LZMA:
l = LzmaLlmDetector(prelude_file=prelude_file)
if ENGINE == CompressionEngine.ZLIB:
l = ZlibLlmDetector(prelude_file=prelude_file)
else:
l = LzmaLlmDetector(fuzziness_digits=fuzziness, prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)
if ENGINE == CompressionEngine.LZMA:
l = LzmaLlmDetector(prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)
if ENGINE == CompressionEngine.ZLIB:
l = ZlibLlmDetector(prelude_str=PRELUDE_STR, prelude_ratio=prelude_ratio)
return l.score_text(c)
def run_on_file_chunked(filename : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_ratio : Optional[float] = None) -> Optional[Score]:
def run_on_file_chunked(filename : str, chunk_size : int = 1500, prelude_ratio : Optional[float] = None) -> Optional[Score]:
'''
Given a filename (and an optional chunk size and number of decimal places to round to) returns the score for the contents of that file.
This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -142,9 +202,9 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1500, fuzziness : int
'''
with open(filename, 'r') as fp:
contents = fp.read()
return run_on_text_chunked(contents, chunk_size, fuzziness=fuzziness, prelude_ratio=prelude_ratio)
return run_on_text_chunked(contents, chunk_size, prelude_ratio=prelude_ratio)
def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Optional[Score]:
def run_on_text_chunked(s : str, chunk_size : int = 1500, prelude_file : Optional[str] = None, prelude_ratio : Optional[float] = None) -> Optional[Score]:
'''
Given a string (and an optional chunk size and number of decimal places to round to) returns the score for the passed string.
This function chunks the input into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@@ -163,11 +223,11 @@ def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, p
scores = []
if len(chunks) > 2:
with Pool(cpu_count()) as pool:
for r in pool.starmap(_score_chunk, zip(chunks, itertools.repeat(fuzziness), itertools.repeat(prelude_file), itertools.repeat(prelude_ratio))):
for r in pool.starmap(_score_chunk, zip(chunks, itertools.repeat(prelude_file), itertools.repeat(prelude_ratio))):
scores.append(r)
else:
for c in chunks:
scores.append(_score_chunk(c, fuzziness=fuzziness, prelude_file=prelude_file, prelude_ratio=prelude_ratio))
scores.append(_score_chunk(c, prelude_file=prelude_file, prelude_ratio=prelude_ratio))
ssum : float = 0.0
for i, s in enumerate(scores):
if s[0] == 'AI':
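
With fuzziness removed, the worker pool in the hunk above zips only the prelude arguments alongside the chunks. A hedged sketch of calling the chunked scorer after this change (the input string is invented; with no prelude arguments it falls back to the module-level ENGINE and PRELUDE_STR):

```python
# Hedged sketch; the input string is invented.
text = 'A few paragraphs of text to classify... ' * 50
result = run_on_text_chunked(text)   # chunk_size defaults to 1500
if result is not None:
    print(result)                    # e.g. ('AI', 1.2345)
```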
@@ -183,10 +243,16 @@ def run_on_text_chunked(s : str, chunk_size : int = 1500, fuzziness : int = 3, p
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-e", choices=['zlib', 'lzma'], help='Which compression engine to use: lzma or zlib', default='lzma', required=False)
group = parser.add_mutually_exclusive_group()
group.add_argument("-s", help='Read from stdin until EOF is reached instead of from a file', required=False, action='store_true')
group.add_argument("sample_files", nargs='*', help='Text file(s) containing the sample to classify', default="")
args = parser.parse_args()
if args.e:
if args.e == 'lzma':
ENGINE = CompressionEngine.LZMA
elif args.e == 'zlib':
ENGINE = CompressionEngine.ZLIB
if args.s:
print(str(run_on_text_chunked(''.join(list(sys.stdin)))))
elif len(args.sample_files) == 0:
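
The new -e flag selects the compression engine at runtime, overriding the ZLIB default set near the top of the file, and -s reads the sample from stdin instead of a file. Hedged invocations (the sample filename is invented):

```
python3 zippy.py -e lzma essay.txt
cat essay.txt | python3 zippy.py -e zlib -s
```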