feat: switch from gpt-3-encoder to gpt3-tokenizer

feature/cli
Travis Fischer 2023-02-19 03:48:06 -06:00
parent fc9869abf5
commit d8eeb1a736
4 changed files with 26 additions and 7 deletions

View file

@@ -36,7 +36,7 @@
   },
   "dependencies": {
     "eventsource-parser": "^0.0.5",
-    "gpt-3-encoder": "^1.1.4",
+    "gpt3-tokenizer": "^1.1.5",
     "keyv": "^4.5.2",
     "p-timeout": "^6.0.0",
     "quick-lru": "^6.1.1",

View file

@@ -8,7 +8,7 @@ specifiers:
   del-cli: ^5.0.0
   dotenv-safe: ^8.2.0
   eventsource-parser: ^0.0.5
-  gpt-3-encoder: ^1.1.4
+  gpt3-tokenizer: ^1.1.5
   husky: ^8.0.2
   keyv: ^4.5.2
   lint-staged: ^13.0.3
@@ -26,7 +26,7 @@ specifiers:
 dependencies:
   eventsource-parser: 0.0.5
-  gpt-3-encoder: 1.1.4
+  gpt3-tokenizer: 1.1.5
   keyv: 4.5.2
   p-timeout: 6.1.0
   quick-lru: 6.1.1
@@ -506,6 +506,10 @@ packages:
       picomatch: 2.3.1
     dev: true

+  /array-keyed-map/2.1.3:
+    resolution: {integrity: sha512-JIUwuFakO+jHjxyp4YgSiKXSZeC0U+R1jR94bXWBcVlFRBycqXlb+kH9JHxBGcxnVuSqx5bnn0Qz9xtSeKOjiA==}
+    dev: false
+
   /array-union/2.1.0:
     resolution: {integrity: sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==}
     engines: {node: '>=8'}
@@ -1376,8 +1380,11 @@ packages:
       get-intrinsic: 1.2.0
     dev: true

-  /gpt-3-encoder/1.1.4:
-    resolution: {integrity: sha512-fSQRePV+HUAhCn7+7HL7lNIXNm6eaFWFbNLOOGtmSJ0qJycyQvj60OvRlH7mee8xAMjBDNRdMXlMwjAbMTDjkg==}
+  /gpt3-tokenizer/1.1.5:
+    resolution: {integrity: sha512-O9iCL8MqGR0Oe9wTh0YftzIbysypNQmS5a5JG3cB3M4LMYjlAVvNnf8LUzVY9MrI7tj+YLY356uHtO2lLX2HpA==}
+    engines: {node: '>=12'}
+    dependencies:
+      array-keyed-map: 2.1.3
     dev: false

   /graceful-fs/4.2.10:

View file

@@ -1,9 +1,9 @@
-import { encode as gptEncode } from 'gpt-3-encoder'
 import Keyv from 'keyv'
 import pTimeout from 'p-timeout'
 import QuickLRU from 'quick-lru'
 import { v4 as uuidv4 } from 'uuid'

+import * as tokenizer from './tokenizer'
 import * as types from './types'
 import { fetch as globalFetch } from './fetch'
 import { fetchSSE } from './fetch-sse'
@@ -438,7 +438,7 @@ Current date: ${currentDate}${this._sepToken}\n\n`
       text = text.replace(/<\|im_sep\|>/g, '<|endoftext|>')
     }

-    return gptEncode(text).length
+    return tokenizer.encode(text).length
   }

   protected get _isChatGPTModel() {
src/tokenizer.ts 100644 (new file, 12 additions)
View file

@@ -0,0 +1,12 @@
+import GPT3TokenizerImport from 'gpt3-tokenizer'
+
+const GPT3Tokenizer: typeof GPT3TokenizerImport =
+  typeof GPT3TokenizerImport === 'function'
+    ? GPT3TokenizerImport
+    : (GPT3TokenizerImport as any).default
+
+export const tokenizer = new GPT3Tokenizer({ type: 'gpt3' })
+
+export function encode(input: string): number[] {
+  return tokenizer.encode(input).bpe
+}
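A minimal usage sketch of the new module (the sample string and printed ids are illustrative): gpt3-tokenizer's encode() returns an object whose bpe field holds the token ids, which is why the wrapper above unwraps it and exposes a plain number[].

import * as tokenizer from './tokenizer'

const ids = tokenizer.encode('Hello world')
console.log(ids)        // BPE token ids, e.g. [15496, 995]
console.log(ids.length) // token count used when budgeting prompts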