Some quality of life improvements.

2023-02-17 10:27:13 -05:00 · 2023-02-17 10:27:13 -05:00 · dfb5df895a
commit dfb5df895a
--- a/README.md
+++ b/README.md
@@ -29,9 +29,6 @@ A quick breakdown of each of the files:
 ```bash
 pip install -r requirements.txt
 ```
-
-If you're using an M1 Macbook, you'll need to replace `tensorflow` with `tensorflow-macos`.
-
 Tested on `Python 3.9.10`.

 #### Usage
--- a/gpt2.py
+++ b/gpt2.py
@@ -46,7 +46,7 @@ def mha(x, c_attn, c_proj, n_head):  # [n_seq, n_embd] -> [n_seq, n_embd]
    qkv_heads = list(map(lambda x: np.split(x, n_head, axis=-1), qkv))  # [3, n_seq, n_embd] -> [3, n_head, n_seq, n_embd/n_head]

    # causal mask to hide future inputs from being attended to
-    causal_mask = (1 - np.tri(x.shape[0], dtype=np.float32)) * -1e10  # [n_seq, n_seq]
+    causal_mask = (1 - np.tri(x.shape[0], dtype=x.dtype)) * -1e10  # [n_seq, n_seq]

    # perform attention over each head
    out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)]  # [3, n_head, n_seq, n_embd/n_head] -> [n_head, n_seq, n_embd/n_head]
--- a/gpt2_pico.py
+++ b/gpt2_pico.py
@@ -24,7 +24,7 @@ def attention(q, k, v, mask):
 def mha(x, c_attn, c_proj, n_head):
    x = linear(x, **c_attn)
    qkv_heads = list(map(lambda x: np.split(x, n_head, axis=-1), np.split(x, 3, axis=-1)))
-    causal_mask = (1 - np.tri(x.shape[0], dtype=np.float32)) * -1e10
+    causal_mask = (1 - np.tri(x.shape[0], dtype=x.dtype)) * -1e10
    out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)]
    x = linear(np.hstack(out_heads), **c_proj)
    return x
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,10 @@
 numpy==1.24.1 # used for the actual model code/weights
 regex==2017.4.5 # used by the bpe tokenizer
 requests==2.27.1 # used to download gpt-2 files from openai
-tensorflow==2.11.0 # used to load the gpt-2 weights from the open-ai tf checkpoint
 tqdm==4.64.0 # progress bar to keep your sanity
 fire==0.5.0 # easy CLI creation
+
+# used to load the gpt-2 weights from the open-ai tf checkpoint
+# M1 Macbooks require tensorflow-macos
+tensorflow==2.11.0; sys_platform != 'darwin' or platform_machine != 'arm64'
+tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
--- a/utils.py
+++ b/utils.py
@@ -49,11 +49,10 @@ def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):
        d[keys[0]] = set_in_nested_dict(d[keys[0]], keys[1:], val)
        return d

-    init_vars = tf.train.list_variables(tf_ckpt_path)
    params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
-    for name, _ in init_vars:
+    for name, _ in tf.train.list_variables(tf_ckpt_path):
        array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))
-        name = name.removeprefix("model/")
+        name = name[len("model/") :]
        if name.startswith("h"):
            m = re.match(r"h([0-9]+)/(.*)", name)
            n = int(m[1])