kopia lustrzana https://github.com/thinkst/zippy
Added OpenAI connector and pytest harness
Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
rodzic
d1f5562602
commit
a9be80e94b
|
@ -0,0 +1,46 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import os, requests
|
||||||
|
from typing import Optional, Dict, Tuple
|
||||||
|
|
||||||
|
# Endpoint that make_req() POSTs each classification request to.
API_URL = 'https://api.openai.com/v1/completions'
# Value sent as the 'model' field of every request.
MODEL_NAME = 'model-detect-v2'
# Bearer token taken from the environment; None when OPENAI_API_KEY is unset.
API_KEY = os.environ.get('OPENAI_API_KEY')
|
||||||
|
|
||||||
|
def make_req(text: str, timeout: float = 60.0) -> Optional[Dict]:
    """Submit *text* to the OpenAI completions endpoint for classification.

    The text is sent as a prompt with the '<disc_score|>' marker appended,
    asking for a single token and its top-5 log-probabilities.

    Args:
        text: The document to classify. Inputs shorter than 1000 characters
            are rejected without contacting the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first entry of the response's 'choices' list, or None when the
        input is too short, no API key is configured, the request fails, or
        the response carries no choices.
    """
    if len(text) < 1000:
        print("Input too short for OpenAI to classify")
        return None
    # Without a key the header concatenation below would raise a TypeError;
    # report the real cause instead.
    if not API_KEY:
        print("OPENAI_API_KEY is not set; unable to query OpenAI")
        return None
    headers = {
        'authorization': 'Bearer ' + API_KEY,
        'origin': 'https://platform.openai.com',
        'openai-organization': 'org-gxAZne8U4jJ8pb632XJBLH1i',
    }
    data = {
        'prompt': text + '<disc_score|>',
        'max_tokens': 1,
        'temperature': 1,
        'top_p': 1,
        'n': 1,
        'model': MODEL_NAME,
        'stream': False,
        'stop': '\\n',
        'logprobs': 5,
    }
    try:
        # requests never times out unless told to; bound the wait explicitly.
        res = requests.post(API_URL, headers=headers, json=data, timeout=timeout)
        body = res.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body on older requests versions.
        print("OpenAI request failed: " + str(err))
        return None
    # Error responses may lack 'choices', or carry an empty or null value.
    choices = body.get('choices') if isinstance(body, dict) else None
    if not choices:
        return None
    return choices[0]
|
||||||
|
|
||||||
|
def run_on_file(fn : str) -> Optional[Tuple[str, float]]:
    """Classify the contents of the file *fn* via make_req().

    Returns a ``(label, score)`` tuple where the label is 'AI' when the
    returned token is '"' and 'Human' when it is '!', and the score is the
    absolute value of that token's log-probability. Returns None when no
    result was obtained or the token is neither of those two.
    """
    with open(fn, 'r') as handle:
        text = handle.read()

    choice = make_req(text)
    if choice is None:
        print("Unable to classify!")
        return None

    # Map the single returned token onto a label; anything else is unknown.
    token = choice.get('text')
    if token == '"':
        label = 'AI'
    elif token == '!':
        label = 'Human'
    else:
        return None

    first_logprob = choice.get('logprobs').get('token_logprobs')[0]
    return (label, abs(first_logprob))
|
|
@ -0,0 +1,26 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import pytest, os
|
||||||
|
from warnings import warn
|
||||||
|
from openai_detect import run_on_file
|
||||||
|
|
||||||
|
AI_SAMPLE_DIR = 'samples/llm-generated/'
HUMAN_SAMPLE_DIR = 'samples/human-generated/'

# Smallest on-disk size (in bytes) a sample must have to be included.
MIN_SAMPLE_SIZE = 1000


def _eligible_samples(sample_dir):
    """Return the sorted entry names in *sample_dir* of at least MIN_SAMPLE_SIZE bytes.

    A sorted list is returned rather than a lazy ``filter`` object so that
    the parametrized test order is reproducible (``os.listdir`` order is
    arbitrary) and the collection can be iterated more than once.
    """
    return sorted(
        name
        for name in os.listdir(sample_dir)
        if os.path.getsize(os.path.join(sample_dir, name)) >= MIN_SAMPLE_SIZE
    )


ai_files = _eligible_samples(AI_SAMPLE_DIR)
human_files = _eligible_samples(HUMAN_SAMPLE_DIR)
|
||||||
|
|
||||||
|
def test_training_file():
    """The training corpus file must be classified as AI-generated."""
    result = run_on_file('ai-generated.txt')
    # run_on_file returns None when no classification was obtained; fail with
    # a clear message rather than a TypeError from subscripting None.
    assert result is not None, 'ai-generated.txt could not be classified'
    assert result[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('f', human_files)
def test_human_samples(f):
    """Each human-written sample must be classified as 'Human'."""
    result = run_on_file(HUMAN_SAMPLE_DIR + f)
    # run_on_file returns None when no classification was obtained; fail with
    # a clear message rather than a TypeError from unpacking None.
    assert result is not None, f + ' could not be classified'
    (classification, score) = result
    assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('f', ai_files)
def test_llm_sample(f):
    """Each LLM-generated sample must be classified as 'AI'."""
    result = run_on_file(AI_SAMPLE_DIR + f)
    # run_on_file returns None when no classification was obtained; fail with
    # a clear message rather than a TypeError from unpacking None.
    assert result is not None, f + ' could not be classified'
    (classification, score) = result
    assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))
|
Ładowanie…
Reference in New Issue