Merge pull request #390 from transitive-bullshit/feature/rust-wasm-tokenizer

pull/403/head
Travis Fischer 2023-02-28 03:43:11 -06:00 zatwierdzone przez GitHub
commit aaa482b5f0
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: 4AEE18F83AFDEB23
3 zmienionych plików z 12 dodań i 23 usunięć

Wyświetl plik

@@ -37,10 +37,10 @@
 "test:prettier": "prettier '**/*.{js,jsx,ts,tsx}' --check"
 },
 "dependencies": {
+"@dqbd/tiktoken": "^0.2.1",
 "cac": "^6.7.14",
 "conf": "^11.0.1",
 "eventsource-parser": "^0.0.5",
-"gpt3-tokenizer": "^1.1.5",
 "keyv": "^4.5.2",
 "p-timeout": "^6.0.0",
 "quick-lru": "^6.1.1",

Wyświetl plik

@@ -1,6 +1,7 @@
 lockfileVersion: 5.4
 specifiers:
+'@dqbd/tiktoken': ^0.2.1
 '@keyv/redis': ^2.5.4
 '@trivago/prettier-plugin-sort-imports': ^4.0.0
 '@types/node': ^18.11.9
@@ -10,7 +11,6 @@ specifiers:
 del-cli: ^5.0.0
 dotenv-safe: ^8.2.0
 eventsource-parser: ^0.0.5
-gpt3-tokenizer: ^1.1.5
 husky: ^8.0.2
 keyv: ^4.5.2
 lint-staged: ^13.0.3
@@ -28,10 +28,10 @@ specifiers:
 uuid: ^9.0.0
 dependencies:
+'@dqbd/tiktoken': 0.2.1
 cac: 6.7.14
 conf: 11.0.1
 eventsource-parser: 0.0.5
-gpt3-tokenizer: 1.1.5
 keyv: 4.5.2
 p-timeout: 6.1.0
 quick-lru: 6.1.1
@@ -300,6 +300,10 @@ packages:
 to-fast-properties: 2.0.0
 dev: true
+/@dqbd/tiktoken/0.2.1:
+resolution: {integrity: sha512-Nw9Swn37xZLAvz64qA3tTxy4yJLMhYDj7dWS6uSoHkUJxTn+BcYA+r06O36Q3Jya52b3SvK/LDXzl1dVeHqrew==}
+dev: false
 /@esbuild-kit/cjs-loader/2.4.1:
 resolution: {integrity: sha512-lhc/XLith28QdW0HpHZvZKkorWgmCNT7sVelMHDj3HFdTfdqkwEKvT+aXVQtNAmCC39VJhunDkWhONWB7335mg==}
 dependencies:
@@ -525,10 +529,6 @@ packages:
 picomatch: 2.3.1
 dev: true
-/array-keyed-map/2.1.3:
-resolution: {integrity: sha512-JIUwuFakO+jHjxyp4YgSiKXSZeC0U+R1jR94bXWBcVlFRBycqXlb+kH9JHxBGcxnVuSqx5bnn0Qz9xtSeKOjiA==}
-dev: false
 /array-union/2.1.0:
 resolution: {integrity: sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==}
 engines: {node: '>=8'}
@@ -1444,13 +1444,6 @@ packages:
 get-intrinsic: 1.2.0
 dev: true
-/gpt3-tokenizer/1.1.5:
-resolution: {integrity: sha512-O9iCL8MqGR0Oe9wTh0YftzIbysypNQmS5a5JG3cB3M4LMYjlAVvNnf8LUzVY9MrI7tj+YLY356uHtO2lLX2HpA==}
-engines: {node: '>=12'}
-dependencies:
-array-keyed-map: 2.1.3
-dev: false
 /graceful-fs/4.2.10:
 resolution: {integrity: sha512-9ByhssR2fPVsNZj478qUUbKfmL0+t5BDVyjShtyZZLiK7ZDAArFFfopyOTj0M05wE2tJPisA4iTnnXl2YoPvOA==}
 dev: true

Wyświetl plik

@@ -1,12 +1,8 @@
-import GPT3TokenizerImport from 'gpt3-tokenizer'
+import { encoding_for_model } from '@dqbd/tiktoken'
-const GPT3Tokenizer: typeof GPT3TokenizerImport =
-typeof GPT3TokenizerImport === 'function'
-? GPT3TokenizerImport
-: (GPT3TokenizerImport as any).default
+// TODO: make this configurable
+const tokenizer = encoding_for_model('text-davinci-003')
-export const tokenizer = new GPT3Tokenizer({ type: 'gpt3' })
-export function encode(input: string): number[] {
-return tokenizer.encode(input).bpe
+export function encode(input: string): Uint32Array {
+return tokenizer.encode(input)
 }