Add CUDA support for Roberta (local) and fix an alignment issue

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
pull/6/head
Jacob Torrey 2023-06-15 10:47:50 -06:00
rodzic 83dd797dcd
commit 700a6e3441
2 zmienionych plików z 27 dodań i 15 usunięć

Wyświetl plik

@ -7,7 +7,7 @@ from typing import Optional, Tuple
from roberta_local import classify_text
def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
def run_on_file_chunked(filename : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
'''
Given a filename (and an optional chunk size) returns the score for the contents of that file.
This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@ -17,7 +17,7 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int
contents = fp.read()
return run_on_text_chunked(contents, chunk_size, fuzziness)
def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
def run_on_text_chunked(contents : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
'''
Given a text (and an optional chunk size) returns the score for the contents of that string.
This function chunks the string into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@ -35,12 +35,15 @@ def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int
chunks = []
while start + chunk_size < len(contents) and end != -1:
end = contents.rfind(' ', start, start + chunk_size + 1)
if end == -1:
end = contents.rfind('\n', start, start + chunk_size + 1)
if end == -1:
print("Unable to chunk naturally!")
end = start + chunk_size + 1
chunks.append(contents[start:end])
start = end + 1
chunks.append(contents[start:])
scores = []
for c in chunks:
scores.append(classify_text(c))
scores = classify_text(chunks)
ssum : float = 0.0
for s in scores:
if s[0] == 'AI':

Wyświetl plik

@ -1,17 +1,26 @@
#!/usr/bin/env python3
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from typing import List, Tuple
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
# Run on the first CUDA device when torch reports one, otherwise fall back to the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# The tokenizer and model are loaded once at import time and shared by every call below.
tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")

# Text-classification pipeline built from the objects above and placed on DEVICE.
pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=DEVICE,
)
def classify_text(s : List[str]) -> List[Tuple[str, float]]:
    '''
    Given a list of texts, classifies each one with the module-level text-classification pipeline.
    Returns one (label, confidence) tuple per pipeline result, in the order the pipeline returned them.
    The pipeline label 'Real' is reported as 'Human'; any other label is reported as 'AI'.
    The confidence is the 'score' value the pipeline attached to that label.
    '''
    res = pipe(s)
    out = []
    for r in res:
        conf = r['score']
        # Exactly one tuple per result: appending both labels for a 'Real' result would
        # make the output longer than the input and count the same text as Human and AI.
        if r['label'] == 'Real':
            out.append(('Human', conf))
        else:
            out.append(('AI', conf))
    return out
def classify_text(s : str):
    '''
    Given a single text, runs it through the module-level tokenizer and model and returns a
    (label, confidence) tuple. The label is 'Human' when the model's top class is named 'Real'
    and 'AI' otherwise; the confidence is the largest softmax probability over the logits.
    '''
    encoded = tokenizer(s, return_tensors='pt')
    # Inference only, so gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=1).tolist()[0]
    conf = max(probabilities)
    predicted = model.config.id2label[logits.argmax().item()]
    return ('Human', conf) if predicted == 'Real' else ('AI', conf)