diff --git a/encoder.py b/encoder.py
index 2569b81..7f7ac25 100644
--- a/encoder.py
+++ b/encoder.py
@@ -16,7 +16,7 @@ def bytes_to_unicode():
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
@@ -54,7 +54,7 @@ class Encoder:
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
 
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
     def bpe(self, token):
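
A minimal sketch (not part of the patch) of why the re.IGNORECASE note in the second hunk matters, assuming the third-party regex module that encoder.py imports as re (the standard-library re has no \p{L}/\p{N} classes):

import regex as re

# The tokenization pattern from Encoder.__init__, copied verbatim.
pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

# Lowercase contractions are split off as their own piece ...
print(re.findall(pat, "he's here"))   # ['he', "'s", ' here']

# ... but an uppercase 'S misses the literal 's alternative and falls through
# to the catch-all punctuation branch, so the 's BPE merge never applies.
print(re.findall(pat, "HE'S HERE"))   # ['HE', "'", 'S', ' HERE']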