switch tokenizer implementation to js-tiktoken, which is pure JS and more compatible

pull/571/head
Claudio Poli 2023-05-17 04:52:37 +02:00
rodzic f39279cef9
commit 3ec62b89b2
3 zmienionych plików z 13 dodań i 12 usunięć

Wyświetl plik

@ -37,10 +37,10 @@
"test:prettier": "prettier '**/*.{js,jsx,ts,tsx}' --check"
},
"dependencies": {
"@dqbd/tiktoken": "^1.0.7",
"cac": "^6.7.14",
"conf": "^11.0.1",
"eventsource-parser": "^1.0.0",
"js-tiktoken": "^1.0.5",
"keyv": "^4.5.2",
"p-timeout": "^6.1.1",
"quick-lru": "^6.1.1",

Wyświetl plik

@ -1,9 +1,6 @@
lockfileVersion: '6.0'
dependencies:
'@dqbd/tiktoken':
specifier: ^1.0.7
version: 1.0.7
cac:
specifier: ^6.7.14
version: 6.7.14
@ -13,6 +10,9 @@ dependencies:
eventsource-parser:
specifier: ^1.0.0
version: 1.0.0
js-tiktoken:
specifier: ^1.0.5
version: 1.0.5
keyv:
specifier: ^4.5.2
version: 4.5.2
@ -192,10 +192,6 @@ packages:
to-fast-properties: 2.0.0
dev: true
/@dqbd/tiktoken@1.0.7:
resolution: {integrity: sha512-bhR5k5W+8GLzysjk8zTMVygQZsgvf7W1F0IlL4ZQ5ugjo5rCyiwGM5d8DYriXspytfu98tv59niang3/T+FoDw==}
dev: false
/@esbuild-kit/cjs-loader@2.4.2:
resolution: {integrity: sha512-BDXFbYOJzT/NBEtp71cvsrGPwGAMGRB/349rwKuoxNSiKjPraNNnlK6MIIabViCjqZugu6j+xeMDlEkWdHHJSg==}
dependencies:
@ -809,7 +805,6 @@ packages:
/base64-js@1.5.1:
resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
dev: true
/binary-extensions@2.2.0:
resolution: {integrity: sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==}
@ -1831,6 +1826,12 @@ packages:
engines: {node: '>=10'}
dev: true
/js-tiktoken@1.0.5:
resolution: {integrity: sha512-RYXe54ntls/uQmAxUua2J1+g+EiwWHGn1CxfioYxrP1iVDmksfZsyJt0VySyMNbreJyyreDtyBuBxeXy7HYqjQ==}
dependencies:
base64-js: 1.5.1
dev: false
/js-tokens@4.0.0:
resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==}

Wyświetl plik

@ -1,8 +1,8 @@
import { get_encoding } from '@dqbd/tiktoken'
import { getEncoding } from 'js-tiktoken'
// Module-level tokenizer instance, created once with the hard-coded
// 'cl100k_base' encoding name.
// NOTE(review): this is a diff view with its change markers stripped, so both
// the old (`get_encoding`) and the new (`getEncoding`) declaration are shown;
// only one of them exists in the real file.
// TODO: make this configurable
const tokenizer = get_encoding('cl100k_base')
const tokenizer = getEncoding('cl100k_base')
/**
 * Encodes `input` with the module-level `tokenizer` and returns the result
 * typed as a `Uint32Array`.
 *
 * NOTE(review): this is a diff view with its change markers stripped, so the
 * body shows both the old `return` (passing the tokenizer result through
 * unchanged) and the new `return` (copying the tokenizer result into a fresh
 * `Uint32Array`). As literally written here, only the first `return` can run.
 *
 * @param input - The text to tokenize.
 * @returns The output of `tokenizer.encode(input)` as a `Uint32Array`.
 */
export function encode(input: string): Uint32Array {
return tokenizer.encode(input)
return new Uint32Array(tokenizer.encode(input))
}