From d8eeb1a736fac122134172800db29b7c9a3f4da3 Mon Sep 17 00:00:00 2001 From: Travis Fischer Date: Sun, 19 Feb 2023 03:48:06 -0600 Subject: [PATCH] feat: switch from gpt-3-encoder to gpt3-tokenizer --- package.json | 2 +- pnpm-lock.yaml | 15 +++++++++++---- src/chatgpt-api.ts | 4 ++-- src/tokenizer.ts | 12 ++++++++++++ 4 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 src/tokenizer.ts diff --git a/package.json b/package.json index e04a532..6db3d40 100644 --- a/package.json +++ b/package.json @@ -36,7 +36,7 @@ }, "dependencies": { "eventsource-parser": "^0.0.5", - "gpt-3-encoder": "^1.1.4", + "gpt3-tokenizer": "^1.1.5", "keyv": "^4.5.2", "p-timeout": "^6.0.0", "quick-lru": "^6.1.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 70c1242..8a65a24 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,7 +8,7 @@ specifiers: del-cli: ^5.0.0 dotenv-safe: ^8.2.0 eventsource-parser: ^0.0.5 - gpt-3-encoder: ^1.1.4 + gpt3-tokenizer: ^1.1.5 husky: ^8.0.2 keyv: ^4.5.2 lint-staged: ^13.0.3 @@ -26,7 +26,7 @@ specifiers: dependencies: eventsource-parser: 0.0.5 - gpt-3-encoder: 1.1.4 + gpt3-tokenizer: 1.1.5 keyv: 4.5.2 p-timeout: 6.1.0 quick-lru: 6.1.1 @@ -506,6 +506,10 @@ packages: picomatch: 2.3.1 dev: true + /array-keyed-map/2.1.3: + resolution: {integrity: sha512-JIUwuFakO+jHjxyp4YgSiKXSZeC0U+R1jR94bXWBcVlFRBycqXlb+kH9JHxBGcxnVuSqx5bnn0Qz9xtSeKOjiA==} + dev: false + /array-union/2.1.0: resolution: {integrity: sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==} engines: {node: '>=8'} @@ -1376,8 +1380,11 @@ packages: get-intrinsic: 1.2.0 dev: true - /gpt-3-encoder/1.1.4: - resolution: {integrity: sha512-fSQRePV+HUAhCn7+7HL7lNIXNm6eaFWFbNLOOGtmSJ0qJycyQvj60OvRlH7mee8xAMjBDNRdMXlMwjAbMTDjkg==} + /gpt3-tokenizer/1.1.5: + resolution: {integrity: sha512-O9iCL8MqGR0Oe9wTh0YftzIbysypNQmS5a5JG3cB3M4LMYjlAVvNnf8LUzVY9MrI7tj+YLY356uHtO2lLX2HpA==} + engines: {node: '>=12'} + dependencies: + array-keyed-map: 2.1.3 dev: false /graceful-fs/4.2.10: diff --git a/src/chatgpt-api.ts b/src/chatgpt-api.ts index 1bd32cf..f13fd30 100644 --- a/src/chatgpt-api.ts +++ b/src/chatgpt-api.ts @@ -1,9 +1,9 @@ -import { encode as gptEncode } from 'gpt-3-encoder' import Keyv from 'keyv' import pTimeout from 'p-timeout' import QuickLRU from 'quick-lru' import { v4 as uuidv4 } from 'uuid' +import * as tokenizer from './tokenizer' import * as types from './types' import { fetch as globalFetch } from './fetch' import { fetchSSE } from './fetch-sse' @@ -438,7 +438,7 @@ Current date: ${currentDate}${this._sepToken}\n\n` text = text.replace(/<\|im_sep\|>/g, '<|endoftext|>') } - return gptEncode(text).length + return tokenizer.encode(text).length } protected get _isChatGPTModel() { diff --git a/src/tokenizer.ts b/src/tokenizer.ts new file mode 100644 index 0000000..81209ed --- /dev/null +++ b/src/tokenizer.ts @@ -0,0 +1,12 @@ +import GPT3TokenizerImport from 'gpt3-tokenizer' + +const GPT3Tokenizer: typeof GPT3TokenizerImport = + typeof GPT3TokenizerImport === 'function' + ? GPT3TokenizerImport + : (GPT3TokenizerImport as any).default + +export const tokenizer = new GPT3Tokenizer({ type: 'gpt3' }) + +export function encode(input: string): number[] { + return tokenizer.encode(input).bpe +}