From dbb5320ece9c5f8ba20eadba5cb56395c4a323e8 Mon Sep 17 00:00:00 2001
From: Kraxner Thomas
Date: Mon, 3 Apr 2023 13:24:35 +0200
Subject: [PATCH] Fix of 2 typos

---
 encoder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/encoder.py b/encoder.py
index 2569b81..7f7ac25 100644
--- a/encoder.py
+++ b/encoder.py
@@ -16,7 +16,7 @@ def bytes_to_unicode():
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
@@ -54,7 +54,7 @@ class Encoder:
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
 
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
     def bpe(self, token):