Mirror of https://github.com/jaymody/picoGPT

Some quality of life improvements.

parent 03d958f892
commit dfb5df895a
README.md

````diff
@@ -29,9 +29,6 @@ A quick breakdown of each of the files:
 ```bash
 pip install -r requirements.txt
 ```
-
-If you're using an M1 Macbook, you'll need to replace `tensorflow` with `tensorflow-macos`.
-
 Tested on `Python 3.9.10`.
 
 #### Usage
````
gpt2.py (2 changed lines)

```diff
@@ -46,7 +46,7 @@ def mha(x, c_attn, c_proj, n_head): # [n_seq, n_embd] -> [n_seq, n_embd]
     qkv_heads = list(map(lambda x: np.split(x, n_head, axis=-1), qkv)) # [3, n_seq, n_embd] -> [3, n_head, n_seq, n_embd/n_head]
 
     # causal mask to hide future inputs from being attended to
-    causal_mask = (1 - np.tri(x.shape[0], dtype=np.float32)) * -1e10 # [n_seq, n_seq]
+    causal_mask = (1 - np.tri(x.shape[0], dtype=x.dtype)) * -1e10 # [n_seq, n_seq]
 
     # perform attention over each head
     out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)] # [3, n_head, n_seq, n_embd/n_head] -> [n_head, n_seq, n_embd/n_head]
```
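The only functional effect of this change is that the additive causal mask now inherits the dtype of `x` instead of always being float32. A minimal standalone sketch of the behaviour (the `make_causal_mask` helper and the zero arrays are illustrative, not part of the repo):

```python
import numpy as np

def make_causal_mask(x):
    # Same expression as in the diff: 0 on and below the diagonal,
    # -1e10 above it, built in the dtype of x rather than a fixed float32.
    return (1 - np.tri(x.shape[0], dtype=x.dtype)) * -1e10

x32 = np.zeros((4, 8), dtype=np.float32)
x64 = np.zeros((4, 8), dtype=np.float64)

print(make_causal_mask(x32).dtype)  # float32
print(make_causal_mask(x64).dtype)  # float64 (the old hard-coded version would give float32)
```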
gpt2_pico.py

```diff
@@ -24,7 +24,7 @@ def attention(q, k, v, mask):
 def mha(x, c_attn, c_proj, n_head):
     x = linear(x, **c_attn)
     qkv_heads = list(map(lambda x: np.split(x, n_head, axis=-1), np.split(x, 3, axis=-1)))
-    causal_mask = (1 - np.tri(x.shape[0], dtype=np.float32)) * -1e10
+    causal_mask = (1 - np.tri(x.shape[0], dtype=x.dtype)) * -1e10
     out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)]
     x = linear(np.hstack(out_heads), **c_proj)
     return x
```
requirements.txt

```diff
@@ -1,6 +1,10 @@
 numpy==1.24.1 # used for the actual model code/weights
 regex==2017.4.5 # used by the bpe tokenizer
 requests==2.27.1 # used to download gpt-2 files from openai
-tensorflow==2.11.0 # used to load the gpt-2 weights from the open-ai tf checkpoint
 tqdm==4.64.0 # progress bar to keep your sanity
 fire==0.5.0 # easy CLI creation
+
+# used to load the gpt-2 weights from the open-ai tf checkpoint
+# M1 Macbooks require tensorflow-macos
+tensorflow==2.11.0; sys_platform != 'darwin' or platform_machine != 'arm64'
+tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
```
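The trailing `; ...` clauses are standard pip environment markers (PEP 508), so a single requirements file can pick `tensorflow` or `tensorflow-macos` depending on the machine; pip evaluates the markers at install time and skips the line that doesn't apply. A rough sketch of what those markers compare against (illustrative only, not part of the repo):

```python
import platform
import sys

# The marker `sys_platform` maps to sys.platform and
# `platform_machine` maps to platform.machine().
is_apple_silicon = sys.platform == "darwin" and platform.machine() == "arm64"

# Mirrors the two requirement lines above: exactly one of them applies.
package = "tensorflow-macos==2.11.0" if is_apple_silicon else "tensorflow==2.11.0"
print(f"pip would install: {package}")
```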
utils.py (5 changed lines)

```diff
@@ -49,11 +49,10 @@ def load_gpt2_params_from_tf_ckpt(tf_ckpt_path, hparams):
         d[keys[0]] = set_in_nested_dict(d[keys[0]], keys[1:], val)
         return d
 
-    init_vars = tf.train.list_variables(tf_ckpt_path)
     params = {"blocks": [{} for _ in range(hparams["n_layer"])]}
-    for name, _ in init_vars:
+    for name, _ in tf.train.list_variables(tf_ckpt_path):
         array = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))
-        name = name.removeprefix("model/")
+        name = name[len("model/") :]
         if name.startswith("h"):
             m = re.match(r"h([0-9]+)/(.*)", name)
             n = int(m[1])
```
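The switch from `str.removeprefix` to plain slicing looks like a compatibility tweak: `removeprefix` only exists on Python 3.9+, while `name[len("model/"):]` works everywhere. The two are only equivalent when the prefix is actually present, which the GPT-2 checkpoint variable names are expected to satisfy; a small standalone comparison (example names only, not the repo's code):

```python
# Example checkpoint-style variable names (illustrative).
with_prefix = "model/h0/attn/c_attn/w"
without_prefix = "h0/attn/c_attn/w"

# Unconditional slice, as in the diff above (works on any Python version).
print(with_prefix[len("model/"):])     # h0/attn/c_attn/w
print(without_prefix[len("model/"):])  # n/c_attn/w  <- silently wrong if the prefix is missing

# str.removeprefix (Python 3.9+) strips only when the prefix matches.
print(with_prefix.removeprefix("model/"))     # h0/attn/c_attn/w
print(without_prefix.removeprefix("model/"))  # h0/attn/c_attn/w (unchanged)
```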