diff --git a/lzma_detect.py b/lzma_detect.py index 51d8ea9..34bc876 100644 --- a/lzma_detect.py +++ b/lzma_detect.py @@ -4,7 +4,7 @@ # (C) 2023 Thinkst Applied Research, PTY # Author: Jacob Torrey -import lzma, argparse +import lzma, argparse, os from typing import List, Optional, Tuple # The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary @@ -70,15 +70,20 @@ class LzmaLlmDetector: determination = 'AI' if delta < 0 or round(delta, self.FUZZINESS_THRESHOLD) == 0: determination = 'Human' + if abs(delta * 100) < .1 and determination == 'AI': + print("Very low-confidence determination of: " + determination) return (determination, abs(delta * 100)) -def run_on_file(filename : str) -> Optional[Tuple[str, float]]: +def run_on_file(filename : str, fuzziness : int = 3) -> Optional[Tuple[str, float]]: with open(filename, 'r') as fp: - l = LzmaLlmDetector(PRELUDE_FILE) + l = LzmaLlmDetector(PRELUDE_FILE, fuzziness) return l.score_text(fp.read()) if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("sample_file", help='Text file containing the sample to classify') + parser.add_argument("sample_files", nargs='+', help='Text file(s) containing the sample to classify') args = parser.parse_args() - print(str(run_on_file(args.sample_file))) \ No newline at end of file + for f in args.sample_files: + print(f) + if os.path.isfile(f): + print(str(run_on_file(f))) \ No newline at end of file diff --git a/samples/human-generated/lzma_readme.txt b/samples/human-generated/lzma_readme.txt new file mode 100644 index 0000000..cc0a004 --- /dev/null +++ b/samples/human-generated/lzma_readme.txt @@ -0,0 +1,6 @@ +This is the first attempt, using the LZMA compression ratios as a way to indirectly measure the perplexity of a text. 
Compression ratios have been used in the past to detect anomalies +in network data for intrusion detection, so if perplexity is roughly a measure of anomalous tokens, it may be possible to use compression to detect low-perplexity text. LZMA creates +a dictionary of seen tokens, and then uses those in place of future tokens. The dictionary size, token length, etc. are all dynamic (though influenced by the 'preset' of 0-9--with 0 +being the fastest but worse compression than 9). The basic idea is to 'seed' an LZMA compression stream with a corpus of AI-generated text (ai-generated.txt) and then measure the +compression ratio of just the seed data with that of the sample appended. Samples that follow more closely in word choice, structure, etc. will achieve a higher compression ratio due +to the prevalence of similar tokens in the dictionary; novel words, structures, etc. will appear anomalous to the seeded dictionary, resulting in a worse compression ratio. \ No newline at end of file diff --git a/test_lzma_detect.py b/test_lzma_detect.py index ac2fae8..1a28aa3 100644 --- a/test_lzma_detect.py +++ b/test_lzma_detect.py @@ -3,5 +3,21 @@ import pytest, os from lzma_detect import run_on_file -def test_corpus(): - assert run_on_file('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is' \ No newline at end of file +AI_SAMPLE_DIR = 'samples/llm-generated/' +HUMAN_SAMPLE_DIR = 'samples/human-generated/' + +ai_files = os.listdir(AI_SAMPLE_DIR) +human_files = os.listdir(HUMAN_SAMPLE_DIR) + +def test_training_file(): + assert run_on_file('ai-generated.txt')[0] == 'AI', 'The training corpus should always be detected as AI-generated... 
since it is' + +@pytest.mark.parametrize('f', human_files) +def test_human_samples(f): + (classification, score) = run_on_file(HUMAN_SAMPLE_DIR + f) + assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(score) + +@pytest.mark.parametrize('f', ai_files) +def test_llm_sample(f): + (classification, score) = run_on_file(AI_SAMPLE_DIR + f) + assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(score) \ No newline at end of file