Added OpenAI connector and pytest harness

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
2023-05-15 13:37:16 -06:00 · 2023-05-15 13:37:16 -06:00 · a9be80e94b
commit a9be80e94b
--- a/openai_detect.py
+++ b/openai_detect.py
@ -0,0 +1,46 @@
 #!/usr/bin/env python3
 import os, requests
 from typing import Optional, Dict, Tuple
 # Model identifier sent as the 'model' field of every completions request.
 MODEL_NAME = 'model-detect-v2'
 # API key read from the environment once at import time; None when
 # OPENAI_API_KEY is unset.
 API_KEY = os.getenv('OPENAI_API_KEY')
 # Endpoint that make_req() POSTs to.
 API_URL = 'https://api.openai.com/v1/completions'
 def make_req(text: str, timeout: float = 30.0) -> Optional[Dict]:
    """Submit *text* to the OpenAI completions endpoint for classification.

    The prompt is *text* followed by the '<disc_score|>' marker, and exactly
    one completion token is requested together with its top-5 log-probabilities.

    Args:
        text: The document to classify. Inputs shorter than 1000 characters
            are rejected locally without contacting the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first entry of the response's 'choices' list, or None when the
        input is too short, no API key is configured, the response body is
        not a JSON object, or the response carries no choices.

    Raises:
        requests.exceptions.RequestException: On network failures, including
            the request exceeding *timeout*.
    """
    if len(text) < 1000:
        print("Input too short for OpenAI to classify")
        return None
    if not API_KEY:
        # 'Bearer ' + None would raise an opaque TypeError; report the cause.
        print("OPENAI_API_KEY is not set; cannot query OpenAI")
        return None
    headers = {
        'authorization': 'Bearer ' + API_KEY,
        'origin': 'https://platform.openai.com',
        'openai-organization': 'org-gxAZne8U4jJ8pb632XJBLH1i',
    }
    data = {
        'prompt': text + '<disc_score|>',
        'max_tokens': 1,
        'temperature': 1,
        'top_p': 1,
        'n': 1,
        'model': MODEL_NAME,
        'stream': False,
        'stop': '\\n',
        'logprobs': 5,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.post(API_URL, headers=headers, json=data, timeout=timeout)
    try:
        payload = res.json()
    except ValueError:
        # The body was not valid JSON, so there is nothing to classify from.
        print("OpenAI returned a non-JSON response")
        return None
    if not isinstance(payload, dict):
        return None
    # `or [None]` covers both a missing 'choices' key and an empty list,
    # which would otherwise raise IndexError on the [0] below.
    choices = payload.get('choices') or [None]
    return choices[0]
 def run_on_file(fn : str) -> Optional[Tuple[str, float]]:
    """Classify the text stored in the file *fn*.

    The whole file is read and handed to make_req(); the single completion
    token that comes back is mapped to a label.

    Returns:
        ('AI', score) when the token is '"' and ('Human', score) when it is
        '!', where score is the absolute value of the first token
        log-probability. None when make_req() produced nothing (after
        printing "Unable to classify!") or the token is anything else.
    """
    with open(fn, 'r') as handle:
        text = handle.read()

    choice = make_req(text)
    if choice is None:
        print("Unable to classify!")
        return None

    token = choice.get('text')
    if token == '"':
        label = 'AI'
    elif token == '!':
        label = 'Human'
    else:
        # Unrecognised completion token: no verdict.
        return None

    first_logprob = choice.get('logprobs').get('token_logprobs')[0]
    return (label, abs(first_logprob))
--- a/test_openai_detect.py
+++ b/test_openai_detect.py
@ -0,0 +1,26 @@
 #!/usr/bin/env python3
 import pytest, os
 from warnings import warn
 from openai_detect import run_on_file
 # Directories holding the labelled sample corpora; the trailing slash is
 # required because file names are appended by plain string concatenation.
 AI_SAMPLE_DIR = 'samples/llm-generated/'
 HUMAN_SAMPLE_DIR = 'samples/human-generated/'
 # Only files of at least 1000 bytes are used as test cases. The lists are
 # materialised and sorted so they can be iterated more than once and so the
 # parametrized test order/IDs are stable between runs (os.listdir order is
 # arbitrary, and filter() objects are exhausted after one pass).
 # NOTE(review): getsize counts bytes while make_req() checks len(text) in
 # characters, so multi-byte text just over 1000 bytes may still be rejected.
 ai_files = sorted(
     f for f in os.listdir(AI_SAMPLE_DIR)
     if os.path.getsize(AI_SAMPLE_DIR + f) >= 1000
 )
 human_files = sorted(
     f for f in os.listdir(HUMAN_SAMPLE_DIR)
     if os.path.getsize(HUMAN_SAMPLE_DIR + f) >= 1000
 )
 def test_training_file():
    """The file 'ai-generated.txt' must be classified as AI-generated."""
    result = run_on_file('ai-generated.txt')
    # run_on_file returns None when no verdict is available; fail with a clear
    # message instead of a TypeError from subscripting None.
    assert result is not None, 'ai-generated.txt could not be classified (run_on_file returned None)'
    assert result[0] == 'AI', 'The training corpus should always be detected as AI-generated... since it is'
@pytest.mark.parametrize('f', human_files)
def test_human_samples(f):
    """Each file under HUMAN_SAMPLE_DIR must be classified as 'Human'."""
    result = run_on_file(HUMAN_SAMPLE_DIR + f)
    # run_on_file returns None when no verdict is available; fail with a clear
    # message instead of a TypeError from unpacking None.
    assert result is not None, f + ' could not be classified (run_on_file returned None)'
    (classification, score) = result
    assert classification == 'Human', f + ' is a human-generated file, misclassified as AI-generated with confidence ' + str(round(score, 8))
@pytest.mark.parametrize('f', ai_files)
def test_llm_sample(f):
    """Each file under AI_SAMPLE_DIR must be classified as 'AI'."""
    result = run_on_file(AI_SAMPLE_DIR + f)
    # run_on_file returns None when no verdict is available; fail with a clear
    # message instead of a TypeError from unpacking None.
    assert result is not None, f + ' could not be classified (run_on_file returned None)'
    (classification, score) = result
    assert classification == 'AI', f + ' is an LLM-generated file, misclassified as human-generated with confidence ' + str(round(score, 8))