Ensure the ai-generated.txt is included in the built package

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head v0.1.1
Jacob Torrey 2023-10-27 17:21:58 +00:00
rodzic 03cde408af
commit b4faecfd92
5 zmienionych plików z 30 dodań i 14 usunięć

Wyświetl plik

@@ -29,9 +29,17 @@ Here are each of them compared with both the LZMA and zlib detector across the t
### Usage ### Usage
ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it. ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it.
First, build and install the tool:
``` ```
$ python3 zippy/zippy.py -h $ python3 setup.py build && python3 setup.py install
usage: zippy.py [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...] ```
It will install a new script (`zippy`) that you can use directly:
```
$ zippy -h
usage: zippy [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
positional arguments: positional arguments:
sample_files Text file(s) containing the sample to classify sample_files Text file(s) containing the sample to classify
@@ -42,7 +50,7 @@ options:
-e {zlib,lzma,brotli,ensemble} -e {zlib,lzma,brotli,ensemble}
Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines
-s Read from stdin until EOF is reached instead of from a file -s Read from stdin until EOF is reached instead of from a file
$ python3 zippy/zippy.py samples/human-generated/about_me.txt $ zippy samples/human-generated/about_me.txt
samples/human-generated/about_me.txt samples/human-generated/about_me.txt
('Human', 0.06013429262166636) ('Human', 0.06013429262166636)
``` ```

Wyświetl plik

@@ -2,8 +2,9 @@ from setuptools import setup
setup( setup(
name='ZipPy setup file', name='ZipPy setup file',
version='0.1.2', version='0.1.1',
packages=['zippy'], packages=['zippy'],
package_data={"": ["*.txt"]},
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'zippy=zippy.zippy:main', 'zippy=zippy.zippy:main',

Wyświetl plik

@@ -37,7 +37,7 @@ else:
PRELUDE_RATIO = None PRELUDE_RATIO = None
def test_training_file(record_property): def test_training_file(record_property):
(classification, score) = zippy.run_on_file_chunked('ai-generated.txt') (classification, score) = zippy.run_on_file_chunked('zippy/ai-generated.txt')
record_property("score", str(score)) record_property("score", str(score))
assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is' assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is'

Wyświetl plik

@@ -14,6 +14,7 @@ from enum import Enum
from math import ceil from math import ceil
from typing import List, Optional, Tuple, TypeAlias from typing import List, Optional, Tuple, TypeAlias
from multiprocessing import Pool, cpu_count from multiprocessing import Pool, cpu_count
from importlib.resources import files
Score : TypeAlias = tuple[str, float] Score : TypeAlias = tuple[str, float]
@@ -40,8 +41,7 @@ def clean_text(s : str) -> str:
# The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary # The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
PRELUDE_FILE : str = 'ai-generated.txt' PRELUDE_FILE : str = 'ai-generated.txt'
with open(PRELUDE_FILE, 'r', encoding='utf-8') as fp: PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
PRELUDE_STR = clean_text(fp.read())
class AIDetector(ABC): class AIDetector(ABC):
''' '''
@@ -160,6 +160,7 @@ class LzmaLlmDetector(AIDetector):
#print(prelude_file + ' ratio: ' + str(self.prelude_ratio)) #print(prelude_file + ' ratio: ' + str(self.prelude_ratio))
if prelude_str != None: if prelude_str != None:
self.prelude_str = prelude_str
if self.prelude_ratio == 0.0: if self.prelude_ratio == 0.0:
self.prelude_ratio = self._compress(prelude_str) self.prelude_ratio = self._compress(prelude_str)
@@ -193,22 +194,28 @@ class Zippy:
def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None: def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None:
self.ENGINE = engine self.ENGINE = engine
self.PRESET = preset self.PRESET = preset
self.PRELUDE_FILE = prelude_file if prelude_file == PRELUDE_FILE:
self.PRELUDE_FILE = str(files('zippy').joinpath(PRELUDE_FILE))
self.PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
else:
self.PRELUDE_FILE = prelude_file
with open(self.PRELUDE_FILE, encoding='utf-8') as fp:
self.PRELUDE_STR = clean_text(fp.read())
if engine == CompressionEngine.LZMA: if engine == CompressionEngine.LZMA:
if self.PRESET: if self.PRESET:
self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET) self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
else: else:
self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE) self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR)
elif engine == CompressionEngine.BROTLI: elif engine == CompressionEngine.BROTLI:
if self.PRESET: if self.PRESET:
self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET) self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
else: else:
self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE) self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR)
elif engine == CompressionEngine.ZLIB: elif engine == CompressionEngine.ZLIB:
if self.PRESET: if self.PRESET:
self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET) self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
else: else:
self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE) self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR)
def run_on_file(self, filename : str) -> Optional[Score]: def run_on_file(self, filename : str) -> Optional[Score]:
'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file''' '''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''