fix: tokenizer special tokens

pull/406/head
Travis Fischer 2023-02-28 20:44:48 -06:00
rodzic 05eac22e9f
commit e3ee7272da
1 zmienionych plików z 3 dodań i 2 usunięć

Wyświetl plik

@@ -434,10 +434,11 @@ Current date: ${currentDate}${this._sepToken}\n\n`
if (this._isChatGPTModel) {
// With this model, "<|im_end|>" is 1 token, but tokenizers aren't aware of it yet.
// Replace it with "<|endoftext|>" (which it does know about) so that the tokenizer can count it as 1 token.
text = text.replace(/<\|im_end\|>/g, '<|endoftext|>')
text = text.replace(/<\|im_sep\|>/g, '<|endoftext|>')
// text = text.replace(/<\|im_end\|>/g, '<|endoftext|>')
// text = text.replace(/<\|im_sep\|>/g, '<|endoftext|>')
}
text = text.replace(/<\|endoftext\|>/g, '')
return tokenizer.encode(text).length
}