import numpy as np
from tinygrad.tensor import Tensor
from tqdm import tqdm
from layers.attention import CausalSelfAttention
from layers.feedforward import Linear, SwiGLU
from layers.lookup import Embedding
from layers.norm import LayerNorm
from optimizers.sgd import SGDOptimizer, AdamOptimizer
from utils.dataloader import load_data
from utils.loss_functions import cross_entropy
class GPT:
    """
    GPT model with learned token and position embeddings.
    """
    def __init__(self, vocab_size, n_embed, n_head, n_layer, block_size):
        self.token_embedding = Embedding(vocab_size, n_embed)
        self.position_embedding = Embedding(block_size, n_embed)
        self.layers = n_layer
        self.ln_f = LayerNorm(n_embed)
        self.output_head = Linear(n_embed, vocab_size)
        # Each block is (pre-norm, attention, feed-forward). Note that a
        # single LayerNorm is shared by both sub-layers of a block.
        self.blocks = [
            (
                LayerNorm(n_embed),
                CausalSelfAttention(n_embed, n_head, block_size),
                SwiGLU(n_embed, 4 * n_embed),  # feed-forward with 4x expansion
            )
            for _ in range(self.layers)
        ]
    def parameters(self):
        for block in self.blocks:
            for layer in block:
                yield from layer.parameters()
        yield from self.token_embedding.parameters()
        yield from self.position_embedding.parameters()
        yield from self.ln_f.parameters()
        yield from self.output_head.parameters()
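    # Example (a sketch): counting trainable parameters, assuming each
    # yielded parameter is a tinygrad Tensor exposing a .shape tuple:
    #   n_params = sum(int(np.prod(p.shape)) for p in model.parameters())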
    def __call__(self, idx, debug=False):
        if debug:
            import pdb
            pdb.set_trace()
        B, T = idx.shape  # batch size, sequence length
        tok_emb = self.token_embedding(idx)  # (B, T, n_embed) token embeddings
        pos_emb = self.position_embedding(
            Tensor(np.arange(T, dtype=np.int32)[None, :])
        )  # (1, T, n_embed) position embeddings, broadcast over the batch
        x = tok_emb + pos_emb  # residual stream
        for ln, attn, swiglu in self.blocks:
            x = x + attn(ln(x), debug=False)  # attention sub-layer
            x = x + swiglu(ln(x))  # feed-forward sub-layer
        x = self.ln_f(x)  # final layer norm
        logits = self.output_head(x)  # (B, T, vocab_size)
        return logits
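# Shape sanity check (a sketch, not part of the training run; the tiny
# hyperparameters here are illustrative only):
#   m = GPT(vocab_size=5, n_embed=8, n_head=2, n_layer=1, block_size=4)
#   m(Tensor(np.zeros((1, 4), dtype=np.int32))).shape == (1, 4, 5)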
def train(
    model, train_data, get_batch, optimizer, epochs=10, block_size=128, batch_size=32
):
    for epoch in range(epochs):
        losses = []
        # One "epoch" is roughly one pass over the corpus:
        # total characters / characters consumed per batch.
        for _ in tqdm(
            range(len(train_data) // (batch_size * block_size)),
            desc=f"Epoch {epoch + 1}/{epochs}",
        ):
            x_batch, y_batch = get_batch(train_data, block_size, batch_size)
            logits = model(x_batch)
            loss = cross_entropy(logits, y_batch).mean()
            loss.backward()
            optimizer.step(debug=False)
            optimizer.zero_grad()
            losses.append(loss.item())
        print(f"Epoch {epoch + 1} Loss: {np.mean(losses)}")
        print(
            generate(
                model,
                Tensor(np.array([[0]], dtype=np.int32)),
                max_new_tokens=10,
                block_size=block_size,
            )
        )
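# val_data is loaded in __main__ but never evaluated; a periodic validation
# pass could mirror a training step without backward() (sketch):
#   xv, yv = get_batch(val_data, block_size, batch_size)
#   val_loss = cross_entropy(model(xv), yv).mean().item()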
def generate(model, idx, max_new_tokens, block_size):
    """
    Generate new tokens from the model given a starting sequence idx.
    Returns the sampled tokens decoded via the module-level `itos` mapping
    built in the __main__ block.
    """
    chars = []
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]  # crop context to block_size
        logits = model(idx_cond)  # (1, T, vocab_size)
        logits = logits[:, -1, :]  # (1, vocab_size), logits for the last position
        probs = logits.softmax(-1)  # softmax over the vocabulary
        next_token = probs.multinomial(num_samples=1)  # sample one token id
        idx = Tensor(
            np.concatenate([idx.numpy(), next_token.numpy()], axis=1)
        )  # append the sampled token to the running context
        chars.append(itos[int(next_token.numpy()[0, 0])])
    return "".join(chars)
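# Usage sketch (names assume the __main__ block below has run):
#   seed = Tensor(np.array([[stoi["A"]]], dtype=np.int32))
#   print(generate(model, seed, max_new_tokens=50, block_size=block_size))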
if __name__ == "__main__":
    block_size = 128
    batch_size = 32
    n_embed = 32
    n_head = 4
    n_layer = 2
    epochs = 10
    get_batch, train_data, val_data, stoi, itos, vocab_size = load_data(
        "data/alice_in_wonderland.txt", block_size, batch_size
    )
    model = GPT(vocab_size, n_embed, n_head, n_layer, block_size)
    # Materialize the parameter list once; the optimizer keeps references
    # to these tensors and updates them in place.
    tracked = list(model.parameters())
    # optimizer = SGDOptimizer(tracked, lr=1e-3)  # plain-SGD alternative
    optimizer = AdamOptimizer(
        tracked,
        lr=1e-3,
    )
    train(model, train_data, get_batch, optimizer, epochs, block_size, batch_size)