
Commit 2051a29

committed: new file for hugging face training openwebtext
1 parent cb738d9

11 files changed (+480 -146 lines)

.gitignore (+2 -1)

@@ -2,4 +2,5 @@ umb/www.umb.edu
 umb/umb_data
 umb/test-env
 gpt/models
-gpt/data/ubuntu_dataset
+test-env
+gpt/data/*
4 binary files changed (607 Bytes, 472 Bytes, 15.3 KB, 22.2 KB); binary file contents not shown.

gpt/lm_config.py (+1 -1)

@@ -13,4 +13,4 @@
 TOKENIZER_MODEL="o200k_base"

 # Data Config
-TRAIN_DATA_PATH="data/threebody.txt"
+TRAIN_DATA_PATH="data/openwebtext/test.hf"
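
The new path points at a Hugging Face dataset on disk rather than a plain-text file (and lm_model.py below now imports datasets). The commit does not show how this file gets loaded, so the following is only a minimal sketch of one way it might be consumed, assuming data/openwebtext/test.hf was written with datasets.Dataset.save_to_disk() and has a "text" column; none of this code is part of the commit.

# Hypothetical loading sketch (not in the commit): read the saved dataset and
# tokenize a small slice into the flat token tensor the training code expects.
import datasets
import tiktoken
import torch

data_path = "data/openwebtext/test.hf"              # value set in lm_config.py above
ds = datasets.load_from_disk(data_path)             # assumes a Dataset saved via save_to_disk()
tokenizer = tiktoken.get_encoding("o200k_base")     # TOKENIZER_MODEL from lm_config.py

ids = []
for example in ds.select(range(min(1000, len(ds)))):    # small slice for a quick smoke test
    ids.extend(tokenizer.encode(example["text"]))        # assumes a "text" column
tokens = torch.tensor(ids, dtype=torch.long)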

gpt/lm_model.py (+4 -144)

@@ -14,7 +14,9 @@
 from tqdm import tqdm
 import argparse
 import optparse
-from .lm_config import *
+from lm_config import *
+import os
+import datasets

 # Load the default model configuration
 LM_MODEL_CONFIG = [

@@ -283,6 +285,7 @@ def __init__(self, vocab_size, embedding_size, batch_size, block_size, learning_
         self.blocks = nn.Sequential(*[TransformerBlock(embedding_size, embedding_size, head_count, block_size, dropout) for _ in range(layer_count)])
         self.layer_norm = nn.LayerNorm(embedding_size, device=device)
         self.lm_head = nn.Linear(embedding_size, vocab_size, bias=False, device=device)
+        self.current_index = 0

     def forward(self, idx, targets=None):
         """

@@ -333,147 +336,4 @@ def generate(self, idx, max_new_tokens):
             idx = torch.cat((idx, idx_next), dim=1)
         return idx

-    def train_model(self, tokens, eval_iters=200, training_val_ratio=0.8, loss_report_interval=500):
-        """
-        Built-in unit test for training the model on a dataset reporting the training and validation loss
-
-        Parameters
-        ----------
-        tokens : torch.Tensor
-            The dataset of tokens
-        eval_iters : int, optional
-            The number of iterations to estimate the loss
-        training_val_ratio : float, optional
-            The ratio of the dataset to use for training (lower ratio means more data for validation)
-        loss_report_interval : int, optional
-            The interval to report the training and validation loss
-        """
-        training_tokens = tokens[:floor(len(tokens)*training_val_ratio)]
-        validation_tokens = tokens[floor(len(tokens)*training_val_ratio):]
-        optimizer = adamw.AdamW(self.parameters(), lr=self.learning_rate)
-        for step in tqdm(range(self.steps)):
-            optimizer.zero_grad()
-            s, t = sample(training_tokens, 4, 8)
-            logits, loss = lm(s, t)
-            loss.backward()
-            optimizer.step()
-            if step % loss_report_interval == 0:
-                losses = self._estimate_loss(eval_iters, training_tokens, validation_tokens)
-                print(f"step {step}: train loss {losses[0]:.4f}, val loss {losses[1]:.4f}")
-
-    @torch.no_grad()
-    def _estimate_loss(self, eval_iters, training_data, validation_data):
-        """
-        Returns the loss of the model on a training and validation dataset
-
-        Parameters
-        ----------
-        eval_iters : int
-            The number of iterations to estimate the loss
-        training_data : torch.Tensor
-            The training dataset [B x T] where B is the batch size and T is the number of tokens in a block
-        validation_data : torch.Tensor
-            The validation dataset [B x T]
-        """
-        out = {}
-        # Disable dropout and layer normalization before model validation
-        self.eval()
-        for i, split in enumerate([training_data, validation_data]):
-            losses = torch.zeros(eval_iters)
-            for k in range(eval_iters):
-                X, Y = sample(split, self.batch_size, self.block_size)
-                logits, loss = self(X, Y)
-                losses[k] = loss.item()
-            out[i] = losses.mean()
-        # Enable dropout and layer normalization after model validation
-        self.train()
-        return out
-
-# python lm_model.py -t tiktoken -m o200k_base -s models/threebody/200k_base -d data/threebody.txt 384 64 256 3e-4 5000 6 6 0.2
-def sample(data, batch_size, block_size):
-    starting_indices = torch.randint(len(data) - block_size, (batch_size,))
-    sample = torch.stack([data[start_idx:start_idx+block_size] for start_idx in starting_indices])
-    target = torch.stack([data[start_idx+1:start_idx+block_size+1] for start_idx in starting_indices])
-    return sample, target
-
-def useTiktoken(filename, model_name="o200k_base"):
-    tokenizer = tiktoken.get_encoding(model_name)
-    assert tokenizer.decode(tokenizer.encode("hello world")) == "hello world"
-    with open(filename) as f:
-        tokens = torch.tensor(tokenizer.encode(f.read()), dtype=torch.long, device=device)
-
-    return tokenizer, tokens, tokenizer.n_vocab
-
-def useLocal(filename, model_name="tokenizer_models/umb100k-1.model"):
-    tokenizer = Tokenizer()
-    tokenizer.load(model_name)
-    assert tokenizer.decode(tokenizer.encode("hello world")) == "hello world"
-    with open(filename) as f:
-        tokens = torch.tensor(tokenizer.encode(f.read()), dtype=torch.long, device=device)
-
-    return tokenizer, tokens, len(tokenizer._vocab)
-
-
-if __name__ == "__main__":
-    parser=argparse.ArgumentParser(
-        description="""Train a language model on a dataset and generate text""")
-    parser.add_argument('-t', '--tokenizer', type=str, default=TOKENIZER_NAME, help=f'Specify the tokenizer to use (default: {TOKENIZER_NAME})')
-    parser.add_argument('-m', '--tokenizer_model', type=str, default=TOKENIZER_MODEL, help=f'Specify the tokenizer model to use (default: {TOKENIZER_MODEL})')
-    parser.add_argument('-l', '--load_model', type=str, default="untrained", help='Specify the model to use [model_path] (default: untrained)')
-    parser.add_argument('-s', '--save_model', type=str, default="default", help='Specify the model to save the model to [model_path] (default: same as load_model path, no_save: do not save model)')
-    parser.add_argument('-d', '--data', type=str, default=TRAIN_DATA_PATH, help=f'Specify the data to use for training (default: {TRAIN_DATA_PATH})')
-    parser.add_argument('--no_train', type=bool, default=False, help='Do not train the model')
-    parser.add_argument('params', nargs='*', default=LM_MODEL_CONFIG, help=f'Training parameters for the model [embedding_size, batch_size, block_size, learning_rate, steps, head_count, layer_count, dropout]\n(default: {LM_MODEL_CONFIG})')
-    # python
-    args=parser.parse_args()
-    print(args)
-    if args.tokenizer == "tokenizer":
-        tokenizer, tokens, vocab_size = useLocal(args.data, args.tokenizer_model)
-    elif args.tokenizer == "tiktoken":
-        tokenizer, tokens, vocab_size = useTiktoken(args.data, args.tokenizer_model)
-    else:
-        print("Invalid tokenizer: must be either 'tokenizer' or 'tiktoken'")
-        exit()
-
-    lm = LanguageModel(
-        vocab_size=vocab_size,
-        embedding_size=int(args.params[0]),
-        batch_size=int(args.params[1]),
-        block_size=int(args.params[2]),
-        learning_rate=float(args.params[3]),
-        steps=int(args.params[4]),
-        head_count=int(args.params[5]),
-        layer_count=int(args.params[6]),
-        dropout=float(args.params[7])
-    )
-
-    if args.load_model != "untrained":
-        try:
-            lm.load_state_dict(torch.load(args.load_model))
-        except:
-            print("Error: Model not found")
-            exit()
-    else:
-        print("Warning: Using untrained model")
-
-    if not args.no_train:
-        lm.train_model(tokens)
-
-    start_idx, _ = sample(tokens, lm.batch_size, lm.block_size)
-    outputs = lm.generate(start_idx, max_new_tokens=400)[0].tolist()
-    print(f"Prompt:\n{tokenizer.decode(start_idx[0].tolist())}\nGenerated Response:\n{tokenizer.decode(outputs)}")
-
-    if args.save_model == "default":
-        if args.load_model == "untrained":
-            print("Warning: Model not saved")
-        else:
-            torch.save(lm.state_dict(), args.load_model)
-    elif args.save_model == "no_save":
-        print("Warning: Model not saved")
-    else:
-        torch.save(lm.state_dict(), args.save_model)
-

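
Beyond the import changes, the only addition to the model itself is self.current_index = 0, which nothing in this commit uses yet. One plausible use for such a cursor is stepping through a large tokenized corpus sequentially instead of drawing random offsets; the helper below is purely a hypothetical illustration of that idea, not code from this repository.

# Hypothetical (not in the commit): sequential batching driven by a cursor such as
# model.current_index, wrapping back to the start when the token stream runs out.
import torch

def next_sequential_batch(model, data, batch_size, block_size):
    span = batch_size * block_size
    if model.current_index + span + 1 > len(data):
        model.current_index = 0                      # wrap around at the end of the data
    start = model.current_index
    xs = torch.stack([data[start + i*block_size : start + (i+1)*block_size]
                      for i in range(batch_size)])
    ys = torch.stack([data[start + i*block_size + 1 : start + (i+1)*block_size + 1]
                      for i in range(batch_size)])
    model.current_index += span
    return xs, ys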

gpt/lm_model_train.py (new file, +185)

@@ -0,0 +1,185 @@
+from tokenizer import Tokenizer
+import torch
+from torch.optim import adamw
+import tiktoken
+from math import floor
+from tqdm import tqdm
+import argparse
+from lm_config import *
+import os
+from lm_model import LanguageModel
+
+# Load the default model configuration
+LM_MODEL_CONFIG = [
+    EMBEDDING_SIZE,
+    BATCH_SIZE,
+    BLOCK_SIZE,
+    LEARNING_RATE,
+    STEPS,
+    HEAD_COUNT,
+    LAYER_COUNT,
+    DROPOUT
+]
+
+# Set the device to use for training
+device = "cuda" if torch.cuda.is_available() else "cpu"
+if device == "cpu":
+    print("Warning: Using CPU for training; consider using a GPU for faster training")
+
+def sample(data, batch_size, block_size):
+    starting_indices = torch.randint(len(data) - block_size, (batch_size,))
+    sample = torch.stack([data[start_idx:start_idx+block_size] for start_idx in starting_indices])
+    target = torch.stack([data[start_idx+1:start_idx+block_size+1] for start_idx in starting_indices])
+    return sample, target
+
+def train(model, tokens, eval_iters=200, training_val_ratio=0.8, loss_report_interval=500):
+    """
+    Built-in unit test for training the model on a dataset reporting the training and validation loss
+
+    Parameters
+    ----------
+    tokens : torch.Tensor
+        The dataset of tokens
+    eval_iters : int, optional
+        The number of iterations to estimate the loss
+    training_val_ratio : float, optional
+        The ratio of the dataset to use for training (lower ratio means more data for validation)
+    loss_report_interval : int, optional
+        The interval to report the training and validation loss
+    """
+
+    training_tokens = tokens[:floor(len(tokens)*training_val_ratio)]
+    validation_tokens = tokens[floor(len(tokens)*training_val_ratio):]
+
+    optimizer = adamw.AdamW(model.parameters(), lr=model.learning_rate)
+    loader = tqdm(total=model.steps)
+    for step in range(model.steps):
+        optimizer.zero_grad()
+        s, t = sample(training_tokens, 4, 8)
+        logits, loss = model(s, t)
+        loss.backward()
+        optimizer.step()
+        if step % loss_report_interval == 0:
+            losses = _estimate_loss(model, eval_iters, training_tokens, validation_tokens)
+            loader.set_description(f"Step {step}: train loss {losses[0]:.4f}, val loss {losses[1]:.4f}")
+
+        loader.update()
+
+    loader.close()
+
+@torch.no_grad()
+def _estimate_loss(model, eval_iters, training_data, validation_data):
+    """
+    Returns the loss of the model on a training and validation dataset
+
+    Parameters
+    ----------
+    eval_iters : int
+        The number of iterations to estimate the loss
+    training_data : torch.Tensor
+        The training dataset [B x T] where B is the batch size and T is the number of tokens in a block
+    validation_data : torch.Tensor
+        The validation dataset [B x T]
+    """
+    out = {}
+    # Disable dropout and layer normalization before model validation
+    model.eval()
+    for i, split in enumerate([training_data, validation_data]):
+        losses = torch.zeros(eval_iters)
+        for k in range(eval_iters):
+            X, Y = sample(split, model.batch_size, model.block_size)
+            logits, loss = model(X, Y)
+            losses[k] = loss.item()
+        out[i] = losses.mean()
+    # Enable dropout and layer normalization after model validation
+    model.train()
+    return out
+
+def useTiktoken(filename, model_name="o200k_base"):
+    tokenizer = tiktoken.get_encoding(model_name)
+    assert tokenizer.decode(tokenizer.encode("hello world")) == "hello world"
+    with open(filename) as f:
+        tokens = torch.tensor(tokenizer.encode(f.read()), dtype=torch.long, device=device)
+
+    return tokenizer, tokens, tokenizer.n_vocab
+
+def useLocal(filename, model_name="tokenizer_models/umb100k-1.model"):
+    tokenizer = Tokenizer()
+    tokenizer.load(model_name)
+    assert tokenizer.decode(tokenizer.encode("hello world")) == "hello world"
+    with open(filename) as f:
+        tokens = torch.tensor(tokenizer.encode(f.read()), dtype=torch.long, device=device)
+
+    return tokenizer, tokens, len(tokenizer._vocab)
+
+
+if __name__ == "__main__":
+    parser=argparse.ArgumentParser(
+        description="""Train a language model on a dataset and generate text""")
+    parser.add_argument('-t', '--tokenizer', type=str, default=TOKENIZER_NAME, help=f'Specify the tokenizer to use (default: {TOKENIZER_NAME})')
+    parser.add_argument('-m', '--tokenizer_model', type=str, default=TOKENIZER_MODEL, help=f'Specify the tokenizer model to use (default: {TOKENIZER_MODEL})')
+    parser.add_argument('-l', '--load_model', type=str, default="untrained", help='Specify the model to use [model_path] (default: untrained)')
+    parser.add_argument('-s', '--save_model', type=str, default="default", help='Specify the model to save the model to [model_path] (default: same as load_model path, no_save: do not save model)')
+    parser.add_argument('-d', '--data', type=str, default=TRAIN_DATA_PATH, help=f'Specify the data to use for training (default: {TRAIN_DATA_PATH})')
+    parser.add_argument('--no_train', type=bool, default=False, help='Do not train the model')
+    parser.add_argument('params', nargs='*', default=LM_MODEL_CONFIG, help=f'Training parameters for the model [embedding_size, batch_size, block_size, learning_rate, steps, head_count, layer_count, dropout]\n(default: {LM_MODEL_CONFIG})')
+    # python
+    args=parser.parse_args()
+    print(args)
+
+    if not os.path.exists(args.data):
+        print("Error: Data path does not exist. Exiting.")
+        exit()
+
+    if args.tokenizer == "tokenizer":
+        tokenizer, tokens, vocab_size = useLocal(args.data, args.tokenizer_model)
+    elif args.tokenizer == "tiktoken":
+        tokenizer, tokens, vocab_size = useTiktoken(args.data, args.tokenizer_model)
+    else:
+        print("Invalid tokenizer: must be either 'tokenizer' or 'tiktoken'")
+        exit()
+
+    lm = LanguageModel(
+        vocab_size=vocab_size,
+        embedding_size=int(args.params[0]),
+        batch_size=int(args.params[1]),
+        block_size=int(args.params[2]),
+        learning_rate=float(args.params[3]),
+        steps=int(args.params[4]),
+        head_count=int(args.params[5]),
+        layer_count=int(args.params[6]),
+        dropout=float(args.params[7])
+    )
+
+    if args.load_model != "untrained":
+        try:
+            lm.load_state_dict(torch.load(args.load_model))
+        except:
+            print("Error: Model not found")
+            exit()
+    else:
+        print("Warning: Using untrained model")
+
+    if not args.no_train:
+        train(lm, tokens)
+
+    start_idx, _ = sample(tokens, lm.batch_size, lm.block_size)
+    outputs = lm.generate(start_idx, max_new_tokens=400)[0].tolist()
+    print(f"Prompt:\n{tokenizer.decode(start_idx[0].tolist())}\nGenerated Response:\n{tokenizer.decode(outputs)}")
+
+    if args.save_model == "default":
+        if args.load_model == "untrained":
+            print("Warning: Model not saved")
+        else:
+            torch.save(lm.state_dict(), args.load_model)
+    elif args.save_model == "no_save":
+        print("Warning: Model not saved")
+    else:
+        torch.save(lm.state_dict(), args.save_model)
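
For reference, the comment removed from lm_model.py recorded the old training invocation against data/threebody.txt. Based on the flags defined above, an equivalent run of the new script against the new default data path might look like the line below; the hyperparameter values are simply carried over from that old example command. Note that, as committed, useTiktoken() still opens the -d path as plain text, so a saved Hugging Face dataset would need the kind of datasets-based loading sketched earlier.

python lm_model_train.py -t tiktoken -m o200k_base -d data/openwebtext/test.hf 384 64 256 3e-4 5000 6 6 0.2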
