kopia lustrzana https://github.com/thinkst/zippy
Ensure the ai-generated.txt is included in the built package
Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head v0.1.1
rodzic
03cde408af
commit
b4faecfd92
14
README.md
14
README.md
|
@@ -29,9 +29,17 @@ Here are each of them compared with both the LZMA and zlib detector across the t
|
||||||
### Usage
|
### Usage
|
||||||
|
|
||||||
ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it.
|
ZipPy will read files passed as command-line arguments, or will read from stdin to allow for piping of text to it.
|
||||||
|
|
||||||
|
First, build and install the tool:
|
||||||
```
|
```
|
||||||
$ python3 zippy/zippy.py -h
|
$ python3 setup.py build && python3 setup.py install
|
||||||
usage: zippy.py [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
|
```
|
||||||
|
|
||||||
|
It will install a new script (`zippy`) that you can use directly:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ zippy -h
|
||||||
|
usage: zippy [-h] [-p P] [-e {zlib,lzma,brotli,ensemble}] [-s | sample_files ...]
|
||||||
|
|
||||||
positional arguments:
|
positional arguments:
|
||||||
sample_files Text file(s) containing the sample to classify
|
sample_files Text file(s) containing the sample to classify
|
||||||
|
@@ -42,7 +50,7 @@ options:
|
||||||
-e {zlib,lzma,brotli,ensemble}
|
-e {zlib,lzma,brotli,ensemble}
|
||||||
Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines
|
Which compression engine to use: lzma, zlib, brotli, or an ensemble of all engines
|
||||||
-s Read from stdin until EOF is reached instead of from a file
|
-s Read from stdin until EOF is reached instead of from a file
|
||||||
$ python3 zippy/zippy.py samples/human-generated/about_me.txt
|
$ zippy samples/human-generated/about_me.txt
|
||||||
samples/human-generated/about_me.txt
|
samples/human-generated/about_me.txt
|
||||||
('Human', 0.06013429262166636)
|
('Human', 0.06013429262166636)
|
||||||
```
|
```
|
||||||
|
|
3
setup.py
3
setup.py
|
@@ -2,8 +2,9 @@ from setuptools import setup
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='ZipPy setup file',
|
name='ZipPy setup file',
|
||||||
version='0.1.2',
|
version='0.1.1',
|
||||||
packages=['zippy'],
|
packages=['zippy'],
|
||||||
|
package_data={"": ["*.txt"]},
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
'zippy=zippy.zippy:main',
|
'zippy=zippy.zippy:main',
|
||||||
|
|
|
@@ -37,7 +37,7 @@ else:
|
||||||
PRELUDE_RATIO = None
|
PRELUDE_RATIO = None
|
||||||
|
|
||||||
def test_training_file(record_property):
|
def test_training_file(record_property):
|
||||||
(classification, score) = zippy.run_on_file_chunked('ai-generated.txt')
|
(classification, score) = zippy.run_on_file_chunked('zippy/ai-generated.txt')
|
||||||
record_property("score", str(score))
|
record_property("score", str(score))
|
||||||
assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
|
assert classification == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
|
||||||
|
|
||||||
|
|
|
@@ -14,6 +14,7 @@ from enum import Enum
|
||||||
from math import ceil
|
from math import ceil
|
||||||
from typing import List, Optional, Tuple, TypeAlias
|
from typing import List, Optional, Tuple, TypeAlias
|
||||||
from multiprocessing import Pool, cpu_count
|
from multiprocessing import Pool, cpu_count
|
||||||
|
from importlib.resources import files
|
||||||
|
|
||||||
Score : TypeAlias = tuple[str, float]
|
Score : TypeAlias = tuple[str, float]
|
||||||
|
|
||||||
|
@@ -40,8 +41,7 @@ def clean_text(s : str) -> str:
|
||||||
|
|
||||||
# The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
|
# The prelude file is a text file containing only AI-generated text, it is used to 'seed' the LZMA dictionary
|
||||||
PRELUDE_FILE : str = 'ai-generated.txt'
|
PRELUDE_FILE : str = 'ai-generated.txt'
|
||||||
with open(PRELUDE_FILE, 'r', encoding='utf-8') as fp:
|
PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
|
||||||
PRELUDE_STR = clean_text(fp.read())
|
|
||||||
|
|
||||||
class AIDetector(ABC):
|
class AIDetector(ABC):
|
||||||
'''
|
'''
|
||||||
|
@@ -160,6 +160,7 @@ class LzmaLlmDetector(AIDetector):
|
||||||
#print(prelude_file + ' ratio: ' + str(self.prelude_ratio))
|
#print(prelude_file + ' ratio: ' + str(self.prelude_ratio))
|
||||||
|
|
||||||
if prelude_str != None:
|
if prelude_str != None:
|
||||||
|
self.prelude_str = prelude_str
|
||||||
if self.prelude_ratio == 0.0:
|
if self.prelude_ratio == 0.0:
|
||||||
self.prelude_ratio = self._compress(prelude_str)
|
self.prelude_ratio = self._compress(prelude_str)
|
||||||
|
|
||||||
|
@@ -193,22 +194,28 @@ class Zippy:
|
||||||
def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None:
|
def __init__(self, engine : CompressionEngine = CompressionEngine.LZMA, preset : Optional[int] = None, prelude_file : str = PRELUDE_FILE) -> None:
|
||||||
self.ENGINE = engine
|
self.ENGINE = engine
|
||||||
self.PRESET = preset
|
self.PRESET = preset
|
||||||
self.PRELUDE_FILE = prelude_file
|
if prelude_file == PRELUDE_FILE:
|
||||||
|
self.PRELUDE_FILE = str(files('zippy').joinpath(PRELUDE_FILE))
|
||||||
|
self.PRELUDE_STR = clean_text(files('zippy').joinpath(PRELUDE_FILE).read_text())
|
||||||
|
else:
|
||||||
|
self.PRELUDE_FILE = prelude_file
|
||||||
|
with open(self.PRELUDE_FILE, encoding='utf-8') as fp:
|
||||||
|
self.PRELUDE_STR = clean_text(fp.read())
|
||||||
if engine == CompressionEngine.LZMA:
|
if engine == CompressionEngine.LZMA:
|
||||||
if self.PRESET:
|
if self.PRESET:
|
||||||
self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
|
self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
|
||||||
else:
|
else:
|
||||||
self.detector = LzmaLlmDetector(prelude_file=self.PRELUDE_FILE)
|
self.detector = LzmaLlmDetector(prelude_str=self.PRELUDE_STR)
|
||||||
elif engine == CompressionEngine.BROTLI:
|
elif engine == CompressionEngine.BROTLI:
|
||||||
if self.PRESET:
|
if self.PRESET:
|
||||||
self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
|
self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
|
||||||
else:
|
else:
|
||||||
self.detector = BrotliLlmDetector(prelude_file=self.PRELUDE_FILE)
|
self.detector = BrotliLlmDetector(prelude_str=self.PRELUDE_STR)
|
||||||
elif engine == CompressionEngine.ZLIB:
|
elif engine == CompressionEngine.ZLIB:
|
||||||
if self.PRESET:
|
if self.PRESET:
|
||||||
self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE, preset=self.PRESET)
|
self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR, preset=self.PRESET)
|
||||||
else:
|
else:
|
||||||
self.detector = ZlibLlmDetector(prelude_file=self.PRELUDE_FILE)
|
self.detector = ZlibLlmDetector(prelude_str=self.PRELUDE_STR)
|
||||||
|
|
||||||
def run_on_file(self, filename : str) -> Optional[Score]:
|
def run_on_file(self, filename : str) -> Optional[Score]:
|
||||||
'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''
|
'''Given a filename (and an optional number of decimal places to round to) returns the score for the contents of that file'''
|
||||||
|
|
Ładowanie…
Reference in New Issue