From dbb5320ece9c5f8ba20eadba5cb56395c4a323e8 Mon Sep 17 00:00:00 2001
From: Kraxner Thomas <thomas.kraxner@illwerkevkw.at>
Date: Mon, 3 Apr 2023 13:24:35 +0200
Subject: [PATCH] Fix of 2 typos

---
 encoder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/encoder.py b/encoder.py
index 2569b81..7f7ac25 100644
--- a/encoder.py
+++ b/encoder.py
@@ -16,7 +16,7 @@ def bytes_to_unicode():
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
@@ -54,7 +54,7 @@ class Encoder:
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
         self.cache = {}
 
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
         self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
     def bpe(self, token):