diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..b93c02a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "nlzmadetect/LZMA-JS"] + path = nlzmadetect/LZMA-JS + url = git@github.com:LZMA-JS/LZMA-JS.git diff --git a/nlzmadetect/LZMA-JS b/nlzmadetect/LZMA-JS new file mode 160000 index 0000000..8f98fe8 --- /dev/null +++ b/nlzmadetect/LZMA-JS @@ -0,0 +1 @@ +Subproject commit 8f98fe85a1ef78ab6e9d26ab85ce338f50095a23 diff --git a/nlzmadetect/aidetect.html b/nlzmadetect/aidetect.html new file mode 100644 index 0000000..a370fd2 --- /dev/null +++ b/nlzmadetect/aidetect.html @@ -0,0 +1,13 @@ + + + LLM text detector + + + +

LZMA-based LLM text classifier

+
+
+ The results will appear here! + + + \ No newline at end of file diff --git a/nlzmadetect/nlzmadetect.nimble b/nlzmadetect/nlzmadetect.nimble index 2da4039..b27b666 100644 --- a/nlzmadetect/nlzmadetect.nimble +++ b/nlzmadetect/nlzmadetect.nimble @@ -19,4 +19,7 @@ task debug, "Build a debug version of the CLI application": exec "nimble build --threads:on" task release, "Build a release version of the CLI application": - exec "nimble build -d:release --threads:on" \ No newline at end of file + exec "nimble build -d:release --threads:on" + +task buildjs, "Build a Javascript version of the application": + exec "nim js -d:release src/nlzmadetect.nim" \ No newline at end of file diff --git a/nlzmadetect/src/nlzmadetect.nim b/nlzmadetect/src/nlzmadetect.nim index 5d03212..b90183f 100644 --- a/nlzmadetect/src/nlzmadetect.nim +++ b/nlzmadetect/src/nlzmadetect.nim @@ -1,24 +1,45 @@ -import std/[re, math, threadpool] -import lzma -import encodings +when defined(c): + import std/[re, threadpool, encodings] + import lzma +when defined(js): + import std/[jsffi, jsre] + import dom +import std/math import strutils -when isMainModule: +when isMainModule and defined(c): import std/[parseopt, os] -const PRELUDE_FILE = "../../ai-generated.txt" const COMPRESSION_PRESET = 2.int32 const SHORT_SAMPLE_THRESHOLD = 350 + + +const PRELUDE_FILE = "../../ai-generated.txt" const PRELUDE_STR = staticRead(PRELUDE_FILE) - proc compress_str(s : string, preset = COMPRESSION_PRESET): float64 - var PRELUDE_RATIO = compress_str("") +when defined(js): + var console {.importc, nodecl.}: JsObject + proc compress(str : cstring, mode : int) : seq[byte] {.importjs: "LZMA.compress(#, #)".} + console.log("Initialized with a prelude compression ratio of: " & $PRELUDE_RATIO) + +# Target independent wrapper for LZMA compression +proc ti_compress(input : cstring, preset: int32, check: int32): seq[byte] = + when defined(c): + return compress(input, preset, check) + when defined(js): + return compress(input, preset) + proc compress_str(s : string, preset = COMPRESSION_PRESET): float64 = let in_len = PRELUDE_STR.len + s.len + var combined : string = PRELUDE_STR & s + when defined(c): combined = convert(PRELUDE_STR & s, "us-ascii", "UTF-8").replace(re"[^\x00-\x7F]") - out_len = compress(combined.cstring, preset, 0.int32).len + when defined(js): + let nonascii = newRegExp(r"[^\x00-\x7F]") + combined = $combined.cstring.replace(nonascii, "") + let out_len = ti_compress(combined.cstring, preset, 0.int32).len return out_len.toFloat / in_len.toFloat proc score_string*(s : string, fuzziness : int): (string, float64) = @@ -36,20 +57,27 @@ proc score_string*(s : string, fuzziness : int): (string, float64) = return (determination, abs(delta) * 100.0) -proc score_chunk(chunk : string, fuzziness : int): float64 = - var (d, s) = score_string(chunk, fuzziness) - if d == "AI": - return -1.0 * s - return s +when defined(c): + proc score_chunk(chunk : string, fuzziness : int): float64 = + var (d, s) = score_string(chunk, fuzziness) + if d == "AI": + return -1.0 * s + return s -proc run_on_file_chunked*(filename : string, chunk_size : int = 1024, fuzziness : int = 3): (string, float64) = - var inf = readFile(filename) - - inf = replace(inf, re" +", " ") - inf = replace(inf, re"\t") - inf = replace(inf, re"\n+", "\n") - inf = replace(inf, re"\n ", "\n") - inf = replace(inf, re" \n", "\n") +proc run_on_text_chunked*(text : string, chunk_size : int = 1024, fuzziness : int = 3): (string, float64) = + var inf : string = text + when defined(c): + inf = replace(inf, re" +", " ") + inf = replace(inf, re"\t") + inf = replace(inf, re"\n+", "\n") + inf = replace(inf, re"\n ", "\n") + inf = replace(inf, re" \n", "\n") + when defined(js): + inf = $inf.cstring.replace(newRegExp(r" +"), " ") + inf = $inf.cstring.replace(newRegExp(r"\t"), "") + inf = $inf.cstring.replace(newRegExp(r"\n+"), "\n") + inf = $inf.cstring.replace(newRegExp(r"\n "), "\n") + inf = $inf.cstring.replace(newRegExp(r" \n"), "\n") var start = 0 @@ -62,18 +90,24 @@ proc run_on_file_chunked*(filename : string, chunk_size : int = 1024, fuzziness chunks.add(inf[start..inf.len-1]) var scores : seq[(string, float64)] = @[] - var flows : seq[FlowVar[float64]] = @[] - for c in chunks: - flows.add(spawn score_chunk(c, fuzziness)) - for f in flows: - let score = ^f - var d : string = "Human" - if score < 0.0: - d = "AI" - scores.add((d, score * -1.0)) - else: - scores.add((d, score)) + when defined(c): + var flows : seq[FlowVar[float64]] = @[] + for c in chunks: + flows.add(spawn score_chunk(c, fuzziness)) + + for f in flows: + let score = ^f + var d : string = "Human" + if score < 0.0: + d = "AI" + scores.add((d, score * -1.0)) + else: + scores.add((d, score)) + when defined(js): + for c in chunks: + scores.add(score_string(c, fuzziness)) + var ssum : float64 = 0.0 for s in scores: if s[0] == "AI": @@ -86,11 +120,11 @@ proc run_on_file_chunked*(filename : string, chunk_size : int = 1024, fuzziness else: return ("Human", abs(sa)) -when isMainModule: +when isMainModule and defined(c): proc display_help() = echo "Call with one or more files to classify" -when isMainModule: +when defined(c) and isMainModule: var filenames : seq[string] = @[] parser = initOptParser() @@ -110,5 +144,12 @@ when isMainModule: for fn in filenames: if fileExists(fn): echo fn - let (d, s) = run_on_file_chunked(fn) + let (d, s) = run_on_text_chunked(readFile(fn)) echo "(" & d & ", " & $s.formatFloat(ffDecimal, 8) & ")" + +when defined(js) and isMainModule: + proc do_detect() {.exportc.} = + let + text : string = $document.getElementById("text_input").value + var (d, s) = run_on_text_chunked(text) + document.getElementById("output_span").textContent = d.cstring & ", confidence score of: " & ($s.round(6)).cstring