Add CUDA support for Roberta (local) and fix an alignment issue

Signed-off-by: Jacob Torrey <jacob@thinkst.com>
2023-06-15 10:47:50 -06:00 · 2023-06-15 10:47:50 -06:00 · 700a6e3441
commit 700a6e3441
--- a/roberta_detect.py
+++ b/roberta_detect.py
@ -7,7 +7,7 @@ from typing import Optional, Tuple

 from roberta_local import classify_text

-def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
+def run_on_file_chunked(filename : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
 	'''
 	Given a filename (and an optional chunk size) returns the score for the contents of that file.
 	This function chunks the file into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@ -17,7 +17,7 @@ def run_on_file_chunked(filename : str, chunk_size : int = 1025, fuzziness : int
 		contents = fp.read()
 	return run_on_text_chunked(contents, chunk_size, fuzziness)

-def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
+def run_on_text_chunked(contents : str, chunk_size : int = 800, fuzziness : int = 3) -> Optional[Tuple[str, float]]:
 	'''
 	Given a text (and an optional chunk size) returns the score for the contents of that string.
 	This function chunks the string into at most chunk_size parts to score separately, then returns an average. This prevents a very large input
@ -35,12 +35,15 @@ def run_on_text_chunked(contents : str, chunk_size : int = 1025, fuzziness : int
 	chunks = []
 	while start + chunk_size < len(contents) and end != -1:
 		end = contents.rfind(' ', start, start + chunk_size + 1)
+		if end == -1:
+			end = contents.rfind('\n', start, start + chunk_size + 1)
+		if end == -1:
+			print("Unable to chunk naturally!")
+			end = start + chunk_size + 1
 		chunks.append(contents[start:end])
 		start = end + 1
 	chunks.append(contents[start:])
-	scores = []
-	for c in chunks:
-		scores.append(classify_text(c))
+	scores = classify_text(chunks)
 	ssum : float = 0.0
 	for s in scores:
 		if s[0] == 'AI':
--- a/roberta_local.py
+++ b/roberta_local.py
@ -1,17 +1,26 @@
 #!/usr/bin/env python3

-from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from typing import List, Tuple
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch

+if torch.cuda.is_available():
+    DEVICE = 'cuda:0'  # CUDA is available: run the pipeline on GPU index 0
+else:
+    DEVICE = 'cpu'  # no CUDA device: fall back to CPU
+
 tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
 model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
+pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=DEVICE)
+
+def classify_text(s : List[str]) -> List[Tuple[str, float]]:
+    '''Classify each string in s; returns one ('Human' or 'AI', score) tuple per pipeline result.'''
+    res = pipe(s)
+    out = []
+    for r in res:
+        # The pipeline label 'Real' is reported as 'Human'; any other label is reported as 'AI'.
+        # Append exactly one tuple per result so the output stays aligned with the input list.
+        label = 'Human' if r['label'] == 'Real' else 'AI'
+        out.append((label, r['score']))
+    return out

-def classify_text(s : str):
-    inputs = tokenizer(s, return_tensors='pt')
-    with torch.no_grad():
-        logits = model(**inputs).logits
-    pc = model.config.id2label[logits.argmax().item()]
-    conf = max(torch.softmax(logits, dim=1).tolist()[0])
-    if pc == 'Real':
-        return ('Human', conf)
-    return ('AI', conf)