From 3ec62b89b290debc5a3ae104b398d6783ea2663f Mon Sep 17 00:00:00 2001 From: Claudio Poli Date: Wed, 17 May 2023 04:52:37 +0200 Subject: [PATCH] switch tokenizer implementation with pure js and more compatible js-tiktoken --- package.json | 2 +- pnpm-lock.yaml | 17 +++++++++-------- src/tokenizer.ts | 6 +++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/package.json b/package.json index 24233c2..dc5128e 100644 --- a/package.json +++ b/package.json @@ -37,10 +37,10 @@ "test:prettier": "prettier '**/*.{js,jsx,ts,tsx}' --check" }, "dependencies": { - "@dqbd/tiktoken": "^1.0.7", "cac": "^6.7.14", "conf": "^11.0.1", "eventsource-parser": "^1.0.0", + "js-tiktoken": "^1.0.5", "keyv": "^4.5.2", "p-timeout": "^6.1.1", "quick-lru": "^6.1.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 62f8f01..c8383ee 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1,9 +1,6 @@ lockfileVersion: '6.0' dependencies: - '@dqbd/tiktoken': - specifier: ^1.0.7 - version: 1.0.7 cac: specifier: ^6.7.14 version: 6.7.14 @@ -13,6 +10,9 @@ dependencies: eventsource-parser: specifier: ^1.0.0 version: 1.0.0 + js-tiktoken: + specifier: ^1.0.5 + version: 1.0.5 keyv: specifier: ^4.5.2 version: 4.5.2 @@ -192,10 +192,6 @@ packages: to-fast-properties: 2.0.0 dev: true - /@dqbd/tiktoken@1.0.7: - resolution: {integrity: sha512-bhR5k5W+8GLzysjk8zTMVygQZsgvf7W1F0IlL4ZQ5ugjo5rCyiwGM5d8DYriXspytfu98tv59niang3/T+FoDw==} - dev: false - /@esbuild-kit/cjs-loader@2.4.2: resolution: {integrity: sha512-BDXFbYOJzT/NBEtp71cvsrGPwGAMGRB/349rwKuoxNSiKjPraNNnlK6MIIabViCjqZugu6j+xeMDlEkWdHHJSg==} dependencies: @@ -809,7 +805,6 @@ packages: /base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} - dev: true /binary-extensions@2.2.0: resolution: {integrity: sha512-jDctJ/IVQbZoJykoeHbhXpOlNBqGNcwXJKJog42E5HDPUwQTSdjCHdihjj0DlnheQ7blbT6dHOafNAiS8ooQKA==} @@ -1831,6 +1826,12 @@ packages: engines: {node: '>=10'} dev: true + /js-tiktoken@1.0.5: + resolution: {integrity: sha512-RYXe54ntls/uQmAxUua2J1+g+EiwWHGn1CxfioYxrP1iVDmksfZsyJt0VySyMNbreJyyreDtyBuBxeXy7HYqjQ==} + dependencies: + base64-js: 1.5.1 + dev: false + /js-tokens@4.0.0: resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} diff --git a/src/tokenizer.ts b/src/tokenizer.ts index dcf2e22..87cc375 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -1,8 +1,8 @@ -import { get_encoding } from '@dqbd/tiktoken' +import { getEncoding } from 'js-tiktoken' // TODO: make this configurable -const tokenizer = get_encoding('cl100k_base') +const tokenizer = getEncoding('cl100k_base') export function encode(input: string): Uint32Array { - return tokenizer.encode(input) + return new Uint32Array(tokenizer.encode(input)) }