Mohammad Ibrahim committed on
Commit 5cf9256 · 1 Parent(s): 7f96ead

Add application file

CodeFiles/classnotes.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
CodeFiles/input.txt ADDED
The diff for this file is too large to render. See raw diff
 
CodeFiles/train_get2-1.py ADDED
@@ -0,0 +1,210 @@
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class CausalSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024 # max sequence length
+    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12 # number of layers
+    n_head: int = 12 # number of heads
+    n_embd: int = 768 # embedding dimension
+
+
+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+    def forward(self, idx, targets=None):
+        # idx is of shape (B, T)
+        B, T = idx.size()
+        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+        # forward the token and position embeddings
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x) # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """Loads pretrained GPT-2 model weights from huggingface"""
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+model = GPT.from_pretrained('gpt2')
+print("didn't crash yet!")
+# STOP
+num_return_sequences = 5
+max_length = 30
+
+model.eval()
+model.to('cuda')
+
+import tiktoken
+enc = tiktoken.get_encoding('gpt2')
+tokens = enc.encode("Hello, I'm a language model,")
+tokens = torch.tensor(tokens, dtype=torch.long) # (8,) # check tiktoken app
+tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
+x = tokens.to('cuda')
+
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+while x.size(1) < max_length:
+    # forward the model to get the logits
+    with torch.no_grad():
+        logits = model(x)[0] # (B, T, vocab_size)
+        # take the logits at the last position
+        logits = logits[:, -1, :] # (B, vocab_size)
+        # get the probabilities
+        probs = F.softmax(logits, dim=-1)
+        # do top-k sampling of 50 (huggingface pipeline default)
+        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+        # select a token from the top-k probabilities
+        # note: multinomial does not demand the input to sum to 1
+        ix = torch.multinomial(topk_probs, 1) # (B, 1)
+        # gather the corresponding indices
+        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
+        # append to the sequence
+        x = torch.cat((x, xcol), dim=1)
+
+# print the generated text
+for i in range(num_return_sequences):
+    tokens = x[i, :max_length].tolist()
+    decoded = enc.decode(tokens)
+    print(">", decoded)
CodeFiles/train_get2-2.py ADDED
@@ -0,0 +1,217 @@
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class CausalSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024 # max sequence length
+    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12 # number of layers
+    n_head: int = 12 # number of heads
+    n_embd: int = 768 # embedding dimension
+
+
+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+    def forward(self, idx, targets=None):
+        # idx is of shape (B, T)
+        B, T = idx.size()
+        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+        # forward the token and position embeddings
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x) # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """Loads pretrained GPT-2 model weights from huggingface"""
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+# model = GPT.from_pretrained('gpt2')
+model = GPT(GPTConfig())
+device = 'cpu'
+if torch.cuda.is_available():
+    device = 'cuda'
+elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+    device = "mps"
+print(f"using device: {device}")
+print("didn't crash yet!")
+# STOP
+num_return_sequences = 5
+max_length = 30
+
+model.eval()
+model.to(device)
+
+import tiktoken
+enc = tiktoken.get_encoding('gpt2')
+tokens = enc.encode("Hello, I'm a language model,")
+tokens = torch.tensor(tokens, dtype=torch.long) # (8,) # check tiktoken app
+tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
+x = tokens.to(device)
+
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+while x.size(1) < max_length:
+    # forward the model to get the logits
+    with torch.no_grad():
+        logits = model(x)[0] # (B, T, vocab_size)
+        # take the logits at the last position
+        logits = logits[:, -1, :] # (B, vocab_size)
+        # get the probabilities
+        probs = F.softmax(logits, dim=-1)
+        # do top-k sampling of 50 (huggingface pipeline default)
+        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+        # select a token from the top-k probabilities
+        # note: multinomial does not demand the input to sum to 1
+        ix = torch.multinomial(topk_probs, 1) # (B, 1)
+        # gather the corresponding indices
+        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
+        # append to the sequence
+        x = torch.cat((x, xcol), dim=1)
+
+# print the generated text
+for i in range(num_return_sequences):
+    tokens = x[i, :max_length].tolist()
+    decoded = enc.decode(tokens)
+    print(">", decoded)
CodeFiles/train_get2-3.py ADDED
@@ -0,0 +1,229 @@
+# adding the batch loading part for training
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class CausalSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024 # max sequence length
+    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12 # number of layers
+    n_head: int = 12 # number of heads
+    n_embd: int = 768 # embedding dimension
+
+
+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+    def forward(self, idx, targets=None):
+        # idx is of shape (B, T)
+        B, T = idx.size()
+        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+        # forward the token and position embeddings
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x) # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """Loads pretrained GPT-2 model weights from huggingface"""
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+# model = GPT.from_pretrained('gpt2')
+
+device = 'cpu'
+if torch.cuda.is_available():
+    device = 'cuda'
+elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+    device = "mps"
+print(f"using device: {device}")
+print("didn't crash yet!")
+# STOP
+num_return_sequences = 5
+max_length = 30
+
+
+
+import tiktoken
+enc = tiktoken.get_encoding('gpt2')
+with open('input.txt', 'r') as f:
+    text = f.read()
+
+text = text[:1000]
+tokens = enc.encode(text)
+B, T = 4, 32
+buf = torch.tensor(tokens[:B*T + 1])
+buf = buf.to(device)
+x = buf[:-1].view(B, T)
+y = buf[1:].view(B, T)
+
+model = GPT(GPTConfig())
+model.to(device)
+
+logits = model(x)
+print(logits[0].shape)
+import sys; sys.exit(0)
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+while x.size(1) < max_length:
+    # forward the model to get the logits
+    with torch.no_grad():
+        logits = model(x)[0] # (B, T, vocab_size)
+        # take the logits at the last position
+        logits = logits[:, -1, :] # (B, vocab_size)
+        # get the probabilities
+        probs = F.softmax(logits, dim=-1)
+        # do top-k sampling of 50 (huggingface pipeline default)
+        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+        # select a token from the top-k probabilities
+        # note: multinomial does not demand the input to sum to 1
+        ix = torch.multinomial(topk_probs, 1) # (B, 1)
+        # gather the corresponding indices
+        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
+        # append to the sequence
+        x = torch.cat((x, xcol), dim=1)
+
+# print the generated text
+for i in range(num_return_sequences):
+    tokens = x[i, :max_length].tolist()
+    decoded = enc.decode(tokens)
+    print(">", decoded)
CodeFiles/train_get2-4.py ADDED
@@ -0,0 +1,232 @@
+# adding the batch loading part for training
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class CausalSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024 # max sequence length
+    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12 # number of layers
+    n_head: int = 12 # number of heads
+    n_embd: int = 768 # embedding dimension
+
+
+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+    def forward(self, idx, targets=None):
+        # idx is of shape (B, T)
+        B, T = idx.size()
+        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+        # forward the token and position embeddings
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x) # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """Loads pretrained GPT-2 model weights from huggingface"""
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+# model = GPT.from_pretrained('gpt2')
+
+device = 'cpu'
+if torch.cuda.is_available():
+    device = 'cuda'
+elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+    device = "mps"
+print(f"using device: {device}")
+print("didn't crash yet!")
+# STOP
+num_return_sequences = 5
+max_length = 30
+
+
+
+import tiktoken
+enc = tiktoken.get_encoding('gpt2')
+with open('input.txt', 'r') as f:
+    text = f.read()
+
+text = text[:1000]
+tokens = enc.encode(text)
+B, T = 4, 32
+buf = torch.tensor(tokens[:B*T + 1])
+buf = buf.to(device)
+x = buf[:-1].view(B, T)
+y = buf[1:].view(B, T)
+
+model = GPT(GPTConfig())
+model.to(device)
+
+logits, loss = model(x, y)
+print(loss)
+import sys; sys.exit(0)
+
+
+
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+while x.size(1) < max_length:
+    # forward the model to get the logits
+    with torch.no_grad():
+        logits = model(x)[0] # (B, T, vocab_size)
+        # take the logits at the last position
+        logits = logits[:, -1, :] # (B, vocab_size)
+        # get the probabilities
+        probs = F.softmax(logits, dim=-1)
+        # do top-k sampling of 50 (huggingface pipeline default)
+        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+        # select a token from the top-k probabilities
+        # note: multinomial does not demand the input to sum to 1
+        ix = torch.multinomial(topk_probs, 1) # (B, 1)
+        # gather the corresponding indices
+        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
+        # append to the sequence
+        x = torch.cat((x, xcol), dim=1)
+
+# print the generated text
+for i in range(num_return_sequences):
+    tokens = x[i, :max_length].tolist()
+    decoded = enc.decode(tokens)
+    print(">", decoded)
CodeFiles/train_get2-5.py ADDED
@@ -0,0 +1,239 @@
+# adding the batch loading part for training
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class CausalSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024 # max sequence length
+    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12 # number of layers
+    n_head: int = 12 # number of heads
+    n_embd: int = 768 # embedding dimension
+
+
+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+    def forward(self, idx, targets=None):
+        # idx is of shape (B, T)
+        B, T = idx.size()
+        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+        # forward the token and position embeddings
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x) # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """Loads pretrained GPT-2 model weights from huggingface"""
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+# model = GPT.from_pretrained('gpt2')
+
+device = 'cpu'
+if torch.cuda.is_available():
+    device = 'cuda'
+elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+    device = "mps"
+print(f"using device: {device}")
+print("didn't crash yet!")
+# STOP
+num_return_sequences = 5
+max_length = 30
+
+
+
+import tiktoken
+enc = tiktoken.get_encoding('gpt2')
+with open('input.txt', 'r') as f:
+    text = f.read()
+
+text = text[:1000]
+tokens = enc.encode(text)
+B, T = 4, 32
+buf = torch.tensor(tokens[:B*T + 1])
+buf = buf.to(device)
+x = buf[:-1].view(B, T)
+y = buf[1:].view(B, T)
+
+model = GPT(GPTConfig())
+model.to(device)
+
+# NEW CODE
+optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
+for i in range(50):
+    optimizer.zero_grad()
+    logits, loss = model(x, y)
+    loss.backward()
+    optimizer.step()
+    print(f'step {i}, loss: {loss.item()}')
+
+
+print(loss)
+import sys; sys.exit(0)
+
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+while x.size(1) < max_length:
+    # forward the model to get the logits
+    with torch.no_grad():
+        logits = model(x)[0] # (B, T, vocab_size)
+        # take the logits at the last position
+        logits = logits[:, -1, :] # (B, vocab_size)
+        # get the probabilities
+        probs = F.softmax(logits, dim=-1)
+        # do top-k sampling of 50 (huggingface pipeline default)
+        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+        # select a token from the top-k probabilities
+        # note: multinomial does not demand the input to sum to 1
+        ix = torch.multinomial(topk_probs, 1) # (B, 1)
+        # gather the corresponding indices
+        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
+        # append to the sequence
+        x = torch.cat((x, xcol), dim=1)
+
+# print the generated text
+for i in range(num_return_sequences):
+    tokens = x[i, :max_length].tolist()
+    decoded = enc.decode(tokens)
+    print(">", decoded)
CodeFiles/train_get2-6.py ADDED
@@ -0,0 +1,262 @@
+# DATALOADER
+import os
+import math
+import time
+import inspect
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+
+class CausalSelfAttention(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        assert config.n_embd % config.n_head == 0
+        # key, query, value projections for all heads, but in a batch
+        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+        # output projection
+        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+        # regularization
+        self.n_head = config.n_head
+        self.n_embd = config.n_embd
+        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+    def forward(self, x):
+        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+        qkv = self.c_attn(x)
+        q, k, v = qkv.split(self.n_embd, dim=2)
+        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+        att = F.softmax(att, dim=-1)
+        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+        # output projection
+        y = self.c_proj(y)
+        return y
+
+
+class MLP(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.gelu = nn.GELU(approximate='tanh')
+        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.c_proj.NANOGPT_SCALE_INIT = 1
+
+    def forward(self, x):
+        x = self.c_fc(x)
+        x = self.gelu(x)
+        x = self.c_proj(x)
+        return x
+
+class Block(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+        self.ln_2 = nn.LayerNorm(config.n_embd)
+        self.mlp = MLP(config)
+
+    def forward(self, x):
+        x = x + self.attn(self.ln_1(x))
+        x = x + self.mlp(self.ln_2(x))
+        return x
+
+
+@dataclass
+class GPTConfig:
+    block_size: int = 1024 # max sequence length
+    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
+    n_layer: int = 12 # number of layers
+    n_head: int = 12 # number of heads
+    n_embd: int = 768 # embedding dimension
+
+
+class GPT(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.transformer = nn.ModuleDict(dict(
+            wte = nn.Embedding(config.vocab_size, config.n_embd),
+            wpe = nn.Embedding(config.block_size, config.n_embd),
+            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+            ln_f = nn.LayerNorm(config.n_embd),
+        ))
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+    def forward(self, idx, targets=None):
+        # idx is of shape (B, T)
+        B, T = idx.size()
+        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+        # forward the token and position embeddings
+        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
+        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
+        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
+        x = tok_emb + pos_emb
+        # forward the blocks of the transformer
+        for block in self.transformer.h:
+            x = block(x)
+        # forward the final layernorm and the classifier
+        x = self.transformer.ln_f(x)
+        logits = self.lm_head(x) # (B, T, vocab_size)
+        loss = None
+        if targets is not None:
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+        return logits, loss
+
+    @classmethod
+    def from_pretrained(cls, model_type):
+        """Loads pretrained GPT-2 model weights from huggingface"""
+        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+        from transformers import GPT2LMHeadModel
+        print("loading weights from pretrained gpt: %s" % model_type)
+
+        # n_layer, n_head and n_embd are determined from model_type
+        config_args = {
+            'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
+            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
+            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
+            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
+        }[model_type]
+        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
+        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
+        # create a from-scratch initialized minGPT model
+        config = GPTConfig(**config_args)
+        model = GPT(config)
+        sd = model.state_dict()
+        sd_keys = sd.keys()
+        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
+
+        # init a huggingface/transformers model
+        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+        sd_hf = model_hf.state_dict()
+
+        # copy while ensuring all of the parameters are aligned and match in names and shapes
+        sd_keys_hf = sd_hf.keys()
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
+        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
+        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+        # this means that we have to transpose these weights when we import them
+        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+        for k in sd_keys_hf:
+            if any(k.endswith(w) for w in transposed):
+                # special treatment for the Conv1D weights we need to transpose
+                assert sd_hf[k].shape[::-1] == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k].t())
+            else:
+                # vanilla copy over the other parameters
+                assert sd_hf[k].shape == sd[k].shape
+                with torch.no_grad():
+                    sd[k].copy_(sd_hf[k])
+
+        return model
+
+# model = GPT.from_pretrained('gpt2')
+
+device = 'cpu'
+if torch.cuda.is_available():
+    device = 'cuda'
+elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+    device = "mps"
+print(f"using device: {device}")
+
+# STOP
+num_return_sequences = 5
+max_length = 30
+
+
+
+import tiktoken
+
+class DataLoaderLite:
+    def __init__(self, B, T):
+        self.B = B
+        self.T = T
+
+        # at init load tokens from disk and store them in memory
+        with open('input.txt', 'r') as f:
+            text = f.read()
+        enc = tiktoken.get_encoding('gpt2')
+        tokens = enc.encode(text)
+        self.tokens = torch.tensor(tokens)
+        print(f'loaded {len(self.tokens)} tokens')
+        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
+
+        # state
+        self.current_position = 0
+
+    def next_batch(self):
+        B, T = self.B, self.T
+        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
+        x = (buf[:-1]).view(B, T) # inputs
+        y = (buf[1:]).view(B, T) # targets
+        # advance the position in the tensor
+        self.current_position += B*T
+        # if loading the next batch would be out of bounds, reset
+        if self.current_position + (B * T + 1) > len(self.tokens):
+            self.current_position = 0
+        return x, y
+
+
+model = GPT(GPTConfig())
+model.to(device)
+
+train_loader = DataLoaderLite(B = 4, T = 32)
+
+# NEW CODE
+optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
+for i in range(50):
+    x, y = train_loader.next_batch()
+    x, y = x.to(device), y.to(device)
+    optimizer.zero_grad()
+    logits, loss = model(x, y)
+    loss.backward()
+    optimizer.step()
+    print(f'step {i}, loss: {loss.item()}')
+
+
+print(loss)
+import sys; sys.exit(0)
+
+torch.manual_seed(42)
+torch.cuda.manual_seed(42)
+while x.size(1) < max_length:
+    # forward the model to get the logits
+    with torch.no_grad():
+        logits = model(x)[0] # (B, T, vocab_size)
+        # take the logits at the last position
+        logits = logits[:, -1, :] # (B, vocab_size)
+        # get the probabilities
+        probs = F.softmax(logits, dim=-1)
+        # do top-k sampling of 50 (huggingface pipeline default)
+        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+        # select a token from the top-k probabilities
+        # note: multinomial does not demand the input to sum to 1
+        ix = torch.multinomial(topk_probs, 1) # (B, 1)
+        # gather the corresponding indices
+        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
+        # append to the sequence
+        x = torch.cat((x, xcol), dim=1)
+
+# print the generated text
+for i in range(num_return_sequences):
+    tokens = x[i, :max_length].tolist()
+    decoded = enc.decode(tokens)
+    print(">", decoded)
CodeFiles/train_get2-7.py ADDED
@@ -0,0 +1,278 @@
1
+ # Weight Sharing
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ # regularization
22
+ self.n_head = config.n_head
23
+ self.n_embd = config.n_embd
24
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
25
+
26
+ def forward(self, x):
27
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
28
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
29
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
30
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
31
+ qkv = self.c_attn(x)
32
+ q, k, v = qkv.split(self.n_embd, dim=2)
33
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
34
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+
37
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
38
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
39
+ att = F.softmax(att, dim=-1)
40
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
41
+
42
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
43
+ # output projection
44
+ y = self.c_proj(y)
45
+ return y
46
+
47
+
48
+ class MLP(nn.Module):
49
+
50
+ def __init__(self, config):
51
+ super().__init__()
52
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
53
+ self.gelu = nn.GELU(approximate='tanh')
54
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
55
+ self.c_proj.NANOGPT_SCALE_INIT = 1
56
+
57
+ def forward(self, x):
58
+ x = self.c_fc(x)
59
+ x = self.gelu(x)
60
+ x = self.c_proj(x)
61
+ return x
62
+
63
+ class Block(nn.Module):
64
+
65
+ def __init__(self, config):
66
+ super().__init__()
67
+ self.ln_1 = nn.LayerNorm(config.n_embd)
68
+ self.attn = CausalSelfAttention(config)
69
+ self.ln_2 = nn.LayerNorm(config.n_embd)
70
+ self.mlp = MLP(config)
71
+
72
+ def forward(self, x):
73
+ x = x + self.attn(self.ln_1(x))
74
+ x = x + self.mlp(self.ln_2(x))
75
+ return x
76
+
77
+
78
+ @dataclass
79
+ class GPTConfig:
80
+ block_size: int = 1024 # max sequence length
81
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
82
+ n_layer: int = 12 # number of layers
83
+ n_head: int = 12 # number of heads
84
+ n_embd: int = 768 # embedding dimension
85
+
86
+
87
+ class GPT(nn.Module):
88
+
89
+ def __init__(self, config):
90
+ super().__init__()
91
+ self.config = config
92
+
93
+ self.transformer = nn.ModuleDict(dict(
94
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
95
+ wpe = nn.Embedding(config.block_size, config.n_embd),
96
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
97
+ ln_f = nn.LayerNorm(config.n_embd),
98
+ ))
99
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
100
+
101
+ # weight sharing
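+ # the input embedding (wte) and the output head (lm_head) point at the same (vocab_size, n_embd) matrix, as in the original GPT-2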
102
+ self.transformer.wte.weight = self.lm_head.weight
103
+
104
+ # weight initialization
105
+ self.apply(self._init_weights)
106
+
107
+ def _init_weights(self, module):
108
+ if isinstance(module, nn.Linear):
109
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
110
+ if module.bias is not None:
111
+ torch.nn.init.zeros_(module.bias)
112
+ elif isinstance(module, nn.Embedding):
113
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
114
+
115
+
116
+
117
+ def forward(self, idx, targets=None):
118
+ # idx is of shape (B, T)
119
+ B, T = idx.size()
120
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
121
+ # forward the token and posisition embeddings
122
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
123
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
124
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
125
+ x = tok_emb + pos_emb
126
+ # forward the blocks of the transformer
127
+ for block in self.transformer.h:
128
+ x = block(x)
129
+ # forward the final layernorm and the classifier
130
+ x = self.transformer.ln_f(x)
131
+ logits = self.lm_head(x) # (B, T, vocab_size)
132
+ loss = None
133
+ if targets is not None:
134
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
135
+ return logits, loss
136
+
137
+ @classmethod
138
+ def from_pretrained(cls, model_type):
139
+ """Loads pretrained GPT-2 model weights from huggingface"""
140
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
141
+ from transformers import GPT2LMHeadModel
142
+ print("loading weights from pretrained gpt: %s" % model_type)
143
+
144
+ # n_layer, n_head and n_embd are determined from model_type
145
+ config_args = {
146
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
147
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
148
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
149
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
150
+ }[model_type]
151
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
152
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
153
+ # create a from-scratch initialized minGPT model
154
+ config = GPTConfig(**config_args)
155
+ model = GPT(config)
156
+ sd = model.state_dict()
157
+ sd_keys = sd.keys()
158
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
159
+
160
+ # init a huggingface/transformers model
161
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
162
+ sd_hf = model_hf.state_dict()
163
+
164
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
165
+ sd_keys_hf = sd_hf.keys()
166
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
167
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
168
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
169
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
170
+ # this means that we have to transpose these weights when we import them
171
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
172
+ for k in sd_keys_hf:
173
+ if any(k.endswith(w) for w in transposed):
174
+ # special treatment for the Conv1D weights we need to transpose
175
+ assert sd_hf[k].shape[::-1] == sd[k].shape
176
+ with torch.no_grad():
177
+ sd[k].copy_(sd_hf[k].t())
178
+ else:
179
+ # vanilla copy over the other parameters
180
+ assert sd_hf[k].shape == sd[k].shape
181
+ with torch.no_grad():
182
+ sd[k].copy_(sd_hf[k])
183
+
184
+ return model
185
+
186
+ # model = GPT.from_pretrained('gpt2')
187
+
188
+ device = 'cpu'
189
+ if torch.cuda.is_available():
190
+ device = 'cuda'
191
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
192
+ device = "mps"
193
+ print(f"using device: {device}")
194
+
195
+ # STOP
196
+ num_return_sequences = 5
197
+ max_length = 30
198
+
199
+
200
+
201
+ import tiktoken
202
+
203
+ class DataLoaderLite:
204
+ def __init__(self, B, T):
205
+ self.B = B
206
+ self.T = T
207
+
208
+ # at init load tokens from disk and store them in memory
209
+ with open('input.txt', 'r') as f:
210
+ text = f.read()
211
+ enc = tiktoken.get_encoding('gpt2')
212
+ tokens = enc.encode(text)
213
+ self.tokens = torch.tensor(tokens)
214
+ print(f'loaded {len(self.tokens)} tokens')
215
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
216
+
217
+ # state
218
+ self.current_position = 0
219
+
220
+ def next_batch(self):
221
+ B, T = self.B, self.T
222
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
223
+ x = (buf[:-1]).view(B, T) # inputs
224
+ y = (buf[1:]).view(B, T) # targets
225
+ # advance the position in the tensor
226
+ self.current_position += B*T
227
+ # if loading the next batch would be out of bounds, reset
228
+ if self.current_position + (B * T + 1) > len(self.tokens):
229
+ self.current_position = 0
230
+ return x, y
231
+
232
+
233
+ model = GPT(GPTConfig())
234
+ model.to(device)
235
+
236
+ train_loader = DataLoaderLite(B = 4, T = 32)
237
+
238
+ # NEW CODE
239
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
240
+ for i in range(50):
241
+ x, y = train_loader.next_batch()
242
+ x, y = x.to(device), y.to(device)
243
+ optimizer.zero_grad()
244
+ logits, loss = model(x, y)
245
+ loss.backward()
246
+ optimizer.step()
247
+ print(f'step{i}, loss: {loss.item()}')
248
+
249
+
250
+ print(loss)
251
+ import sys; sys.exit(0)
252
+
253
+ torch.manual_seed(42)
254
+ torch.cuda.manual_seed(42)
255
+ while x.size(1) < max_length:
256
+ # forward the model to get the logits
257
+ with torch.no_grad():
258
+ logits = model(x)[0] # (B, T, vocab_size)
259
+ # take the logits at the last position
260
+ logits = logits[:, -1, :] # (B, vocab_size)
261
+ # get the probabilities
262
+ probs = F.softmax(logits, dim=-1)
263
+ # do top-k sampling of 50 (huggingface pipeline default)
264
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
265
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
266
+ # select a token from the top-k probabilities
267
+ # note: multinomial does not demand the input to sum to 1
268
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
269
+ # gather the corresponding indices
270
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
271
+ # append to the sequence
272
+ x = torch.cat((x, xcol), dim=1)
273
+
274
+ # print the generated text
275
+ for i in range(num_return_sequences):
276
+ tokens = x[i, :max_length].tolist()
277
+ decoded = enc.decode(tokens)
278
+ print(">", decoded)
CodeFiles/train_get2-8-init.py ADDED
@@ -0,0 +1,287 @@
1
+ # Solving for residual std scaling issue
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ att = F.softmax(att, dim=-1)
41
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
44
+ # output projection
45
+ y = self.c_proj(y)
46
+ return y
47
+
48
+
49
+ class MLP(nn.Module):
50
+
51
+ def __init__(self, config):
52
+ super().__init__()
53
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
54
+ self.gelu = nn.GELU(approximate='tanh')
55
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
56
+ self.c_proj.NANOGPT_SCALE_INIT = 1
57
+
58
+ def forward(self, x):
59
+ x = self.c_fc(x)
60
+ x = self.gelu(x)
61
+ x = self.c_proj(x)
62
+ return x
63
+
64
+ class Block(nn.Module):
65
+
66
+ def __init__(self, config):
67
+ super().__init__()
68
+ self.ln_1 = nn.LayerNorm(config.n_embd)
69
+ self.attn = CausalSelfAttention(config)
70
+ self.ln_2 = nn.LayerNorm(config.n_embd)
71
+ self.mlp = MLP(config)
72
+
73
+ def forward(self, x):
74
+ x = x + self.attn(self.ln_1(x))
75
+ x = x + self.mlp(self.ln_2(x))
76
+ return x
77
+
78
+
79
+ @dataclass
80
+ class GPTConfig:
81
+ block_size: int = 1024 # max sequence length
82
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
83
+ n_layer: int = 12 # number of layers
84
+ n_head: int = 12 # number of heads
85
+ n_embd: int = 768 # embedding dimension
86
+
87
+
88
+ class GPT(nn.Module):
89
+
90
+ def __init__(self, config):
91
+ super().__init__()
92
+ self.config = config
93
+
94
+ self.transformer = nn.ModuleDict(dict(
95
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
96
+ wpe = nn.Embedding(config.block_size, config.n_embd),
97
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
98
+ ln_f = nn.LayerNorm(config.n_embd),
99
+ ))
100
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
101
+
102
+ # weight sharing
103
+ self.transformer.wte.weight = self.lm_head.weight
104
+
105
+ # weight initialization
106
+ self.apply(self._init_weights)
107
+
108
+ def _init_weights(self, module):
109
+ if isinstance(module, nn.Linear):
110
+ std = 0.02
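+ # residual-stream projections get a scaled init of 1/sqrt(2 * n_layer), since every block adds two residual contributions (attention and MLP)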
111
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
112
+ std *= (2 * self.config.n_layer) ** -0.5
113
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
114
+ if module.bias is not None:
115
+ torch.nn.init.zeros_(module.bias)
116
+ elif isinstance(module, nn.Embedding):
117
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
118
+
119
+
120
+
121
+ def forward(self, idx, targets=None):
122
+ # idx is of shape (B, T)
123
+ B, T = idx.size()
124
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
125
+ # forward the token and posisition embeddings
126
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
127
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
128
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
129
+ x = tok_emb + pos_emb
130
+ # forward the blocks of the transformer
131
+ for block in self.transformer.h:
132
+ x = block(x)
133
+ # forward the final layernorm and the classifier
134
+ x = self.transformer.ln_f(x)
135
+ logits = self.lm_head(x) # (B, T, vocab_size)
136
+ loss = None
137
+ if targets is not None:
138
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
139
+ return logits, loss
140
+
141
+ @classmethod
142
+ def from_pretrained(cls, model_type):
143
+ """Loads pretrained GPT-2 model weights from huggingface"""
144
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
145
+ from transformers import GPT2LMHeadModel
146
+ print("loading weights from pretrained gpt: %s" % model_type)
147
+
148
+ # n_layer, n_head and n_embd are determined from model_type
149
+ config_args = {
150
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
151
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
152
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
153
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
154
+ }[model_type]
155
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
156
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
157
+ # create a from-scratch initialized minGPT model
158
+ config = GPTConfig(**config_args)
159
+ model = GPT(config)
160
+ sd = model.state_dict()
161
+ sd_keys = sd.keys()
162
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
163
+
164
+ # init a huggingface/transformers model
165
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
166
+ sd_hf = model_hf.state_dict()
167
+
168
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
169
+ sd_keys_hf = sd_hf.keys()
170
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
171
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
172
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
173
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
174
+ # this means that we have to transpose these weights when we import them
175
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
176
+ for k in sd_keys_hf:
177
+ if any(k.endswith(w) for w in transposed):
178
+ # special treatment for the Conv1D weights we need to transpose
179
+ assert sd_hf[k].shape[::-1] == sd[k].shape
180
+ with torch.no_grad():
181
+ sd[k].copy_(sd_hf[k].t())
182
+ else:
183
+ # vanilla copy over the other parameters
184
+ assert sd_hf[k].shape == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k])
187
+
188
+ return model
189
+
190
+ # model = GPT.from_pretrained('gpt2')
191
+
192
+ device = 'cpu'
193
+ if torch.cuda.is_available():
194
+ device = 'cuda'
195
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
196
+ device = "mps"
197
+ print(f"using device: {device}")
198
+
199
+ # SEED
200
+ torch.manual_seed(1337)
201
+ if torch.cuda.is_available():
202
+ torch.cuda.manual_seed(1337)
203
+
204
+ # STOP
205
+ num_return_sequences = 5
206
+ max_length = 30
207
+
208
+
209
+
210
+ import tiktoken
211
+
212
+ class DataLoaderLite:
213
+ def __init__(self, B, T):
214
+ self.B = B
215
+ self.T = T
216
+
217
+ # at init load tokens from disk and store them in memory
218
+ with open('input.txt', 'r') as f:
219
+ text = f.read()
220
+ enc = tiktoken.get_encoding('gpt2')
221
+ tokens = enc.encode(text)
222
+ self.tokens = torch.tensor(tokens)
223
+ print(f'loaded {len(self.tokens)} tokens')
224
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
225
+
226
+ # state
227
+ self.current_position = 0
228
+
229
+ def next_batch(self):
230
+ B, T = self.B, self.T
231
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
232
+ x = (buf[:-1]).view(B, T) # inputs
233
+ y = (buf[1:]).view(B, T) # targets
234
+ # advance the position in the tensor
235
+ self.current_position += B*T
236
+ # if loading the next batch would be out of bounds, reset
237
+ if self.current_position + (B * T + 1) > len(self.tokens):
238
+ self.current_position = 0
239
+ return x, y
240
+
241
+
242
+ model = GPT(GPTConfig())
243
+ model.to(device)
244
+
245
+ train_loader = DataLoaderLite(B = 4, T = 32)
246
+
247
+ # NEW CODE
248
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
249
+ for i in range(50):
250
+ x, y = train_loader.next_batch()
251
+ x, y = x.to(device), y.to(device)
252
+ optimizer.zero_grad()
253
+ logits, loss = model(x, y)
254
+ loss.backward()
255
+ optimizer.step()
256
+ print(f'step{i}, loss: {loss.item()}')
257
+
258
+
259
+ print(loss)
260
+ import sys; sys.exit(0)
261
+
262
+ torch.manual_seed(42)
263
+ torch.cuda.manual_seed(42)
264
+ while x.size(1) < max_length:
265
+ # forward the model to get the logits
266
+ with torch.no_grad():
267
+ logits = model(x)[0] # (B, T, vocab_size)
268
+ # take the logits at the last position
269
+ logits = logits[:, -1, :] # (B, vocab_size)
270
+ # get the probabilities
271
+ probs = F.softmax(logits, dim=-1)
272
+ # do top-k sampling of 50 (huggingface pipeline default)
273
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
274
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
275
+ # select a token from the top-k probabilities
276
+ # note: multinomial does not demand the input to sum to 1
277
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
278
+ # gather the corresponding indices
279
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
280
+ # append to the sequence
281
+ x = torch.cat((x, xcol), dim=1)
282
+
283
+ # print the generated text
284
+ for i in range(num_return_sequences):
285
+ tokens = x[i, :max_length].tolist()
286
+ decoded = enc.decode(tokens)
287
+ print(">", decoded)
CodeFiles/train_get2-9-speedup1.py ADDED
@@ -0,0 +1,293 @@
1
+ # Solving for residual std scaling issue
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ att = F.softmax(att, dim=-1)
41
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
44
+ # output projection
45
+ y = self.c_proj(y)
46
+ return y
47
+
48
+
49
+ class MLP(nn.Module):
50
+
51
+ def __init__(self, config):
52
+ super().__init__()
53
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
54
+ self.gelu = nn.GELU(approximate='tanh')
55
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
56
+ self.c_proj.NANOGPT_SCALE_INIT = 1
57
+
58
+ def forward(self, x):
59
+ x = self.c_fc(x)
60
+ x = self.gelu(x)
61
+ x = self.c_proj(x)
62
+ return x
63
+
64
+ class Block(nn.Module):
65
+
66
+ def __init__(self, config):
67
+ super().__init__()
68
+ self.ln_1 = nn.LayerNorm(config.n_embd)
69
+ self.attn = CausalSelfAttention(config)
70
+ self.ln_2 = nn.LayerNorm(config.n_embd)
71
+ self.mlp = MLP(config)
72
+
73
+ def forward(self, x):
74
+ x = x + self.attn(self.ln_1(x))
75
+ x = x + self.mlp(self.ln_2(x))
76
+ return x
77
+
78
+
79
+ @dataclass
80
+ class GPTConfig:
81
+ block_size: int = 1024 # max sequence length
82
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
83
+ n_layer: int = 12 # number of layers
84
+ n_head: int = 12 # number of heads
85
+ n_embd: int = 768 # embedding dimension
86
+
87
+
88
+ class GPT(nn.Module):
89
+
90
+ def __init__(self, config):
91
+ super().__init__()
92
+ self.config = config
93
+
94
+ self.transformer = nn.ModuleDict(dict(
95
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
96
+ wpe = nn.Embedding(config.block_size, config.n_embd),
97
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
98
+ ln_f = nn.LayerNorm(config.n_embd),
99
+ ))
100
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
101
+
102
+ # weight sharing
103
+ self.transformer.wte.weight = self.lm_head.weight
104
+
105
+ # weight initialization
106
+ self.apply(self._init_weights)
107
+
108
+ def _init_weights(self, module):
109
+ if isinstance(module, nn.Linear):
110
+ std = 0.02
111
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
112
+ std *= (2 * self.config.n_layer) ** -0.5
113
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
114
+ if module.bias is not None:
115
+ torch.nn.init.zeros_(module.bias)
116
+ elif isinstance(module, nn.Embedding):
117
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
118
+
119
+
120
+
121
+ def forward(self, idx, targets=None):
122
+ # idx is of shape (B, T)
123
+ B, T = idx.size()
124
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
125
+ # forward the token and posisition embeddings
126
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
127
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
128
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
129
+ x = tok_emb + pos_emb
130
+ # forward the blocks of the transformer
131
+ for block in self.transformer.h:
132
+ x = block(x)
133
+ # forward the final layernorm and the classifier
134
+ x = self.transformer.ln_f(x)
135
+ logits = self.lm_head(x) # (B, T, vocab_size)
136
+ loss = None
137
+ if targets is not None:
138
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
139
+ return logits, loss
140
+
141
+ @classmethod
142
+ def from_pretrained(cls, model_type):
143
+ """Loads pretrained GPT-2 model weights from huggingface"""
144
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
145
+ from transformers import GPT2LMHeadModel
146
+ print("loading weights from pretrained gpt: %s" % model_type)
147
+
148
+ # n_layer, n_head and n_embd are determined from model_type
149
+ config_args = {
150
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
151
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
152
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
153
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
154
+ }[model_type]
155
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
156
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
157
+ # create a from-scratch initialized minGPT model
158
+ config = GPTConfig(**config_args)
159
+ model = GPT(config)
160
+ sd = model.state_dict()
161
+ sd_keys = sd.keys()
162
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
163
+
164
+ # init a huggingface/transformers model
165
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
166
+ sd_hf = model_hf.state_dict()
167
+
168
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
169
+ sd_keys_hf = sd_hf.keys()
170
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
171
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
172
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
173
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
174
+ # this means that we have to transpose these weights when we import them
175
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
176
+ for k in sd_keys_hf:
177
+ if any(k.endswith(w) for w in transposed):
178
+ # special treatment for the Conv1D weights we need to transpose
179
+ assert sd_hf[k].shape[::-1] == sd[k].shape
180
+ with torch.no_grad():
181
+ sd[k].copy_(sd_hf[k].t())
182
+ else:
183
+ # vanilla copy over the other parameters
184
+ assert sd_hf[k].shape == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k])
187
+
188
+ return model
189
+
190
+ # model = GPT.from_pretrained('gpt2')
191
+
192
+ device = 'cpu'
193
+ if torch.cuda.is_available():
194
+ device = 'cuda'
195
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
196
+ device = "mps"
197
+ print(f"using device: {device}")
198
+
199
+ # SEED
200
+ torch.manual_seed(1337)
201
+ if torch.cuda.is_available():
202
+ torch.cuda.manual_seed(1337)
203
+
204
+ # STOP
205
+ num_return_sequences = 5
206
+ max_length = 30
207
+
208
+
209
+
210
+ import tiktoken
211
+
212
+ class DataLoaderLite:
213
+ def __init__(self, B, T):
214
+ self.B = B
215
+ self.T = T
216
+
217
+ # at init load tokens from disk and store them in memory
218
+ with open('input.txt', 'r') as f:
219
+ text = f.read()
220
+ enc = tiktoken.get_encoding('gpt2')
221
+ tokens = enc.encode(text)
222
+ self.tokens = torch.tensor(tokens)
223
+ print(f'loaded {len(self.tokens)} tokens')
224
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
225
+
226
+ # state
227
+ self.current_position = 0
228
+
229
+ def next_batch(self):
230
+ B, T = self.B, self.T
231
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
232
+ x = (buf[:-1]).view(B, T) # inputs
233
+ y = (buf[1:]).view(B, T) # targets
234
+ # advance the position in the tensor
235
+ self.current_position += B*T
236
+ # if loading the next batch would be out of bounds, reset
237
+ if self.current_position + (B * T + 1) > len(self.tokens):
238
+ self.current_position = 0
239
+ return x, y
240
+
241
+
242
+ model = GPT(GPTConfig())
243
+ model.to(device)
244
+
245
+ train_loader = DataLoaderLite(B = 8, T = 1024)
246
+
247
+ # NEW CODE
248
+ import time
249
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
250
+ for i in range(50):
251
+ t0 = time.time()
252
+ x, y = train_loader.next_batch()
253
+ x, y = x.to(device), y.to(device)
254
+ optimizer.zero_grad()
255
+ logits, loss = model(x, y)
256
+ loss.backward()
257
+ optimizer.step()
258
+ if device == 'cuda': torch.cuda.synchronize()  # only CUDA needs an explicit sync before reading the wall clock
259
+ t1 = time.time()
260
+ dt = (t1 - t0) * 1000
261
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
262
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')
263
+
264
+
265
+ print(loss)
266
+ import sys; sys.exit(0)
267
+
268
+ torch.manual_seed(42)
269
+ torch.cuda.manual_seed(42)
270
+ while x.size(1) < max_length:
271
+ # forward the model to get the logits
272
+ with torch.no_grad():
273
+ logits = model(x)[0] # (B, T, vocab_size)
274
+ # take the logits at the last position
275
+ logits = logits[:, -1, :] # (B, vocab_size)
276
+ # get the probabilities
277
+ probs = F.softmax(logits, dim=-1)
278
+ # do top-k sampling of 50 (huggingface pipeline default)
279
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
280
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
281
+ # select a token from the top-k probabilities
282
+ # note: multinomial does not demand the input to sum to 1
283
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
284
+ # gather the corresponding indices
285
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
286
+ # append to the sequence
287
+ x = torch.cat((x, xcol), dim=1)
288
+
289
+ # print the generated text
290
+ for i in range(num_return_sequences):
291
+ tokens = x[i, :max_length].tolist()
292
+ decoded = enc.decode(tokens)
293
+ print(">", decoded)
CodeFiles/train_get2-9-speedup2.py ADDED
@@ -0,0 +1,295 @@
1
+ # Solving for residual std scaling issue
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ att = F.softmax(att, dim=-1)
41
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
44
+ # output projection
45
+ y = self.c_proj(y)
46
+ return y
47
+
48
+
49
+ class MLP(nn.Module):
50
+
51
+ def __init__(self, config):
52
+ super().__init__()
53
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
54
+ self.gelu = nn.GELU(approximate='tanh')
55
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
56
+ self.c_proj.NANOGPT_SCALE_INIT = 1
57
+
58
+ def forward(self, x):
59
+ x = self.c_fc(x)
60
+ x = self.gelu(x)
61
+ x = self.c_proj(x)
62
+ return x
63
+
64
+ class Block(nn.Module):
65
+
66
+ def __init__(self, config):
67
+ super().__init__()
68
+ self.ln_1 = nn.LayerNorm(config.n_embd)
69
+ self.attn = CausalSelfAttention(config)
70
+ self.ln_2 = nn.LayerNorm(config.n_embd)
71
+ self.mlp = MLP(config)
72
+
73
+ def forward(self, x):
74
+ x = x + self.attn(self.ln_1(x))
75
+ x = x + self.mlp(self.ln_2(x))
76
+ return x
77
+
78
+
79
+ @dataclass
80
+ class GPTConfig:
81
+ block_size: int = 1024 # max sequence length
82
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
83
+ n_layer: int = 12 # number of layers
84
+ n_head: int = 12 # number of heads
85
+ n_embd: int = 768 # embedding dimension
86
+
87
+
88
+ class GPT(nn.Module):
89
+
90
+ def __init__(self, config):
91
+ super().__init__()
92
+ self.config = config
93
+
94
+ self.transformer = nn.ModuleDict(dict(
95
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
96
+ wpe = nn.Embedding(config.block_size, config.n_embd),
97
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
98
+ ln_f = nn.LayerNorm(config.n_embd),
99
+ ))
100
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
101
+
102
+ # weight sharing
103
+ self.transformer.wte.weight = self.lm_head.weight
104
+
105
+ # weight initialization
106
+ self.apply(self._init_weights)
107
+
108
+ def _init_weights(self, module):
109
+ if isinstance(module, nn.Linear):
110
+ std = 0.02
111
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
112
+ std *= (2 * self.config.n_layer) ** -0.5
113
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
114
+ if module.bias is not None:
115
+ torch.nn.init.zeros_(module.bias)
116
+ elif isinstance(module, nn.Embedding):
117
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
118
+
119
+
120
+
121
+ def forward(self, idx, targets=None):
122
+ # idx is of shape (B, T)
123
+ B, T = idx.size()
124
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
125
+ # forward the token and posisition embeddings
126
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
127
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
128
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
129
+ x = tok_emb + pos_emb
130
+ # forward the blocks of the transformer
131
+ for block in self.transformer.h:
132
+ x = block(x)
133
+ # forward the final layernorm and the classifier
134
+ x = self.transformer.ln_f(x)
135
+ logits = self.lm_head(x) # (B, T, vocab_size)
136
+ loss = None
137
+ if targets is not None:
138
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
139
+ return logits, loss
140
+
141
+ @classmethod
142
+ def from_pretrained(cls, model_type):
143
+ """Loads pretrained GPT-2 model weights from huggingface"""
144
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
145
+ from transformers import GPT2LMHeadModel
146
+ print("loading weights from pretrained gpt: %s" % model_type)
147
+
148
+ # n_layer, n_head and n_embd are determined from model_type
149
+ config_args = {
150
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
151
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
152
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
153
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
154
+ }[model_type]
155
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
156
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
157
+ # create a from-scratch initialized minGPT model
158
+ config = GPTConfig(**config_args)
159
+ model = GPT(config)
160
+ sd = model.state_dict()
161
+ sd_keys = sd.keys()
162
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
163
+
164
+ # init a huggingface/transformers model
165
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
166
+ sd_hf = model_hf.state_dict()
167
+
168
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
169
+ sd_keys_hf = sd_hf.keys()
170
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
171
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
172
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
173
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
174
+ # this means that we have to transpose these weights when we import them
175
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
176
+ for k in sd_keys_hf:
177
+ if any(k.endswith(w) for w in transposed):
178
+ # special treatment for the Conv1D weights we need to transpose
179
+ assert sd_hf[k].shape[::-1] == sd[k].shape
180
+ with torch.no_grad():
181
+ sd[k].copy_(sd_hf[k].t())
182
+ else:
183
+ # vanilla copy over the other parameters
184
+ assert sd_hf[k].shape == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k])
187
+
188
+ return model
189
+
190
+ # model = GPT.from_pretrained('gpt2')
191
+
192
+ device = 'cpu'
193
+ if torch.cuda.is_available():
194
+ device = 'cuda'
195
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
196
+ device = "mps"
197
+ print(f"using device: {device}")
198
+
199
+ # SEED
200
+ torch.manual_seed(1337)
201
+ if torch.cuda.is_available():
202
+ torch.cuda.manual_seed(1337)
203
+
204
+ # STOP
205
+ num_return_sequences = 5
206
+ max_length = 30
207
+
208
+
209
+
210
+ import tiktoken
211
+
212
+ class DataLoaderLite:
213
+ def __init__(self, B, T):
214
+ self.B = B
215
+ self.T = T
216
+
217
+ # at init load tokens from disk and store them in memory
218
+ with open('input.txt', 'r') as f:
219
+ text = f.read()
220
+ enc = tiktoken.get_encoding('gpt2')
221
+ tokens = enc.encode(text)
222
+ self.tokens = torch.tensor(tokens)
223
+ print(f'loaded {len(self.tokens)} tokens')
224
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
225
+
226
+ # state
227
+ self.current_position = 0
228
+
229
+ def next_batch(self):
230
+ B, T = self.B, self.T
231
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
232
+ x = (buf[:-1]).view(B, T) # inputs
233
+ y = (buf[1:]).view(B, T) # targets
234
+ # advance the position in the tensor
235
+ self.current_position += B*T
236
+ # if loading the next batch would be out of bounds, reset
237
+ if self.current_position + (B * T + 1) > len(self.tokens):
238
+ self.current_position = 0
239
+ return x, y
240
+
241
+ # CHANGES IN CURRENT CODE
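+ # 'high' allows TF32 matmuls on supported NVIDIA GPUs: faster tensor-core matmuls with a slightly reduced mantissa, no other code changes needed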
242
+ torch.set_float32_matmul_precision('high')
243
+
244
+ model = GPT(GPTConfig())
245
+ model.to(device)
246
+
247
+ train_loader = DataLoaderLite(B = 8, T = 1024)
248
+
249
+ # NEW CODE
250
+ import time
251
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
252
+ for i in range(50):
253
+ t0 = time.time()
254
+ x, y = train_loader.next_batch()
255
+ x, y = x.to(device), y.to(device)
256
+ optimizer.zero_grad()
257
+ logits, loss = model(x, y)
258
+ loss.backward()
259
+ optimizer.step()
260
+ if device == 'cuda': torch.cuda.synchronize()  # only CUDA needs an explicit sync before reading the wall clock
261
+ t1 = time.time()
262
+ dt = (t1 - t0) * 1000
263
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
264
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')
265
+
266
+
267
+ print(loss)
268
+ import sys; sys.exit(0)
269
+
270
+ torch.manual_seed(42)
271
+ torch.cuda.manual_seed(42)
272
+ while x.size(1) < max_length:
273
+ # forward the model to get the logits
274
+ with torch.no_grad():
275
+ logits = model(x)[0] # (B, T, vocab_size)
276
+ # take the logits at the last position
277
+ logits = logits[:, -1, :] # (B, vocab_size)
278
+ # get the probabilities
279
+ probs = F.softmax(logits, dim=-1)
280
+ # do top-k sampling of 50 (huggingface pipeline default)
281
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
282
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
283
+ # select a token from the top-k probabilities
284
+ # note: multinomial does not demand the input to sum to 1
285
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
286
+ # gather the corresponding indices
287
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
288
+ # append to the sequence
289
+ x = torch.cat((x, xcol), dim=1)
290
+
291
+ # print the generated text
292
+ for i in range(num_return_sequences):
293
+ tokens = x[i, :max_length].tolist()
294
+ decoded = enc.decode(tokens)
295
+ print(">", decoded)
CodeFiles/train_get2-9-speedup3.py ADDED
@@ -0,0 +1,297 @@
1
+ # Logits and Loss
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ att = F.softmax(att, dim=-1)
41
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
44
+ # output projection
45
+ y = self.c_proj(y)
46
+ return y
47
+
48
+
49
+ class MLP(nn.Module):
50
+
51
+ def __init__(self, config):
52
+ super().__init__()
53
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
54
+ self.gelu = nn.GELU(approximate='tanh')
55
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
56
+ self.c_proj.NANOGPT_SCALE_INIT = 1
57
+
58
+ def forward(self, x):
59
+ x = self.c_fc(x)
60
+ x = self.gelu(x)
61
+ x = self.c_proj(x)
62
+ return x
63
+
64
+ class Block(nn.Module):
65
+
66
+ def __init__(self, config):
67
+ super().__init__()
68
+ self.ln_1 = nn.LayerNorm(config.n_embd)
69
+ self.attn = CausalSelfAttention(config)
70
+ self.ln_2 = nn.LayerNorm(config.n_embd)
71
+ self.mlp = MLP(config)
72
+
73
+ def forward(self, x):
74
+ x = x + self.attn(self.ln_1(x))
75
+ x = x + self.mlp(self.ln_2(x))
76
+ return x
77
+
78
+
79
+ @dataclass
80
+ class GPTConfig:
81
+ block_size: int = 1024 # max sequence length
82
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
83
+ n_layer: int = 12 # number of layers
84
+ n_head: int = 12 # number of heads
85
+ n_embd: int = 768 # embedding dimension
86
+
87
+
88
+ class GPT(nn.Module):
89
+
90
+ def __init__(self, config):
91
+ super().__init__()
92
+ self.config = config
93
+
94
+ self.transformer = nn.ModuleDict(dict(
95
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
96
+ wpe = nn.Embedding(config.block_size, config.n_embd),
97
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
98
+ ln_f = nn.LayerNorm(config.n_embd),
99
+ ))
100
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
101
+
102
+ # weight sharing
103
+ self.transformer.wte.weight = self.lm_head.weight
104
+
105
+ # weight initialization
106
+ self.apply(self._init_weights)
107
+
108
+ def _init_weights(self, module):
109
+ if isinstance(module, nn.Linear):
110
+ std = 0.02
111
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
112
+ std *= (2 * self.config.n_layer) ** -0.5
113
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
114
+ if module.bias is not None:
115
+ torch.nn.init.zeros_(module.bias)
116
+ elif isinstance(module, nn.Embedding):
117
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
118
+
119
+
120
+
121
+ def forward(self, idx, targets=None):
122
+ # idx is of shape (B, T)
123
+ B, T = idx.size()
124
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
125
+ # forward the token and posisition embeddings
126
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
127
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
128
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
129
+ x = tok_emb + pos_emb
130
+ # forward the blocks of the transformer
131
+ for block in self.transformer.h:
132
+ x = block(x)
133
+ # forward the final layernorm and the classifier
134
+ x = self.transformer.ln_f(x)
135
+ logits = self.lm_head(x) # (B, T, vocab_size)
136
+ loss = None
137
+ if targets is not None:
138
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
139
+ return logits, loss
140
+
141
+ @classmethod
142
+ def from_pretrained(cls, model_type):
143
+ """Loads pretrained GPT-2 model weights from huggingface"""
144
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
145
+ from transformers import GPT2LMHeadModel
146
+ print("loading weights from pretrained gpt: %s" % model_type)
147
+
148
+ # n_layer, n_head and n_embd are determined from model_type
149
+ config_args = {
150
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
151
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
152
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
153
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
154
+ }[model_type]
155
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
156
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
157
+ # create a from-scratch initialized minGPT model
158
+ config = GPTConfig(**config_args)
159
+ model = GPT(config)
160
+ sd = model.state_dict()
161
+ sd_keys = sd.keys()
162
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
163
+
164
+ # init a huggingface/transformers model
165
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
166
+ sd_hf = model_hf.state_dict()
167
+
168
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
169
+ sd_keys_hf = sd_hf.keys()
170
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
171
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
172
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
173
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
174
+ # this means that we have to transpose these weights when we import them
175
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
176
+ for k in sd_keys_hf:
177
+ if any(k.endswith(w) for w in transposed):
178
+ # special treatment for the Conv1D weights we need to transpose
179
+ assert sd_hf[k].shape[::-1] == sd[k].shape
180
+ with torch.no_grad():
181
+ sd[k].copy_(sd_hf[k].t())
182
+ else:
183
+ # vanilla copy over the other parameters
184
+ assert sd_hf[k].shape == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k])
187
+
188
+ return model
189
+
190
+ # model = GPT.from_pretrained('gpt2')
191
+
192
+ device = 'cpu'
193
+ if torch.cuda.is_available():
194
+ device = 'cuda'
195
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
196
+ device = "mps"
197
+ print(f"using device: {device}")
198
+
199
+ # SEED
200
+ torch.manual_seed(1337)
201
+ if torch.cuda.is_available():
202
+ torch.cuda.manual_seed(1337)
203
+
204
+ # STOP
205
+ num_return_sequences = 5
206
+ max_length = 30
207
+
208
+
209
+
210
+ import tiktoken
211
+
212
+ class DataLoaderLite:
213
+ def __init__(self, B, T):
214
+ self.B = B
215
+ self.T = T
216
+
217
+ # at init load tokens from disk and store them in memory
218
+ with open('input.txt', 'r') as f:
219
+ text = f.read()
220
+ enc = tiktoken.get_encoding('gpt2')
221
+ tokens = enc.encode(text)
222
+ self.tokens = torch.tensor(tokens)
223
+ print(f'loaded {len(self.tokens)} tokens')
224
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
225
+
226
+ # state
227
+ self.current_position = 0
228
+
229
+ def next_batch(self):
230
+ B, T = self.B, self.T
231
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
232
+ x = (buf[:-1]).view(B, T) # inputs
233
+ y = (buf[1:]).view(B, T) # targets
234
+ # advance the position in the tensor
235
+ self.current_position += B*T
236
+ # if loading the next batch would be out of bounds, reset
237
+ if self.current_position + (B * T + 1) > len(self.tokens):
238
+ self.current_position = 0
239
+ return x, y
240
+
241
+ # CHANGES IN CURRENT CODE
242
+ torch.set_float32_matmul_precision('high')
243
+
244
+ model = GPT(GPTConfig())
245
+ model.to(device)
246
+
247
+ train_loader = DataLoaderLite(B = 8, T = 1024)
248
+
249
+ # NEW CODE
250
+ import time
251
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
252
+ for i in range(50):
253
+ t0 = time.time()
254
+ x, y = train_loader.next_batch()
255
+ x, y = x.to(device), y.to(device)
256
+ optimizer.zero_grad()
257
+ # NEW CODE ADDED HERE
258
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
259
+ logits, loss = model(x, y)
260
+ loss.backward()
261
+ optimizer.step()
262
+ torch.cuda.synchronize()
263
+ t1 = time.time()
264
+ dt = (t1 - t0) * 1000
265
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
266
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')
267
+
268
+
269
+ print(loss)
270
+ import sys; sys.exit(0)
271
+
272
+ torch.manual_seed(42)
273
+ torch.cuda.manual_seed(42)
274
+ while x.size(1) < max_length:
275
+ # forward the model to get the logits
276
+ with torch.no_grad():
277
+ logits = model(x)[0] # (B, T, vocab_size)
278
+ # take the logits at the last position
279
+ logits = logits[:, -1, :] # (B, vocab_size)
280
+ # get the probabilities
281
+ probs = F.softmax(logits, dim=-1)
282
+ # do top-k sampling of 50 (huggingface pipeline default)
283
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
284
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
285
+ # select a token from the top-k probabilities
286
+ # note: multinomial does not demand the input to sum to 1
287
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
288
+ # gather the corresponding indices
289
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
290
+ # append to the sequence
291
+ x = torch.cat((x, xcol), dim=1)
292
+
293
+ # print the generated text
294
+ for i in range(num_return_sequences):
295
+ tokens = x[i, :max_length].tolist()
296
+ decoded = enc.decode(tokens)
297
+ print(">", decoded)
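The training loop above combines two precision settings: torch.set_float32_matmul_precision('high') enables TF32 matmuls for fp32 tensors, and the torch.autocast block runs the forward pass in bfloat16 while parameters, gradients and optimizer state stay in fp32. A minimal, self-contained sketch of that pattern, with a tiny linear model and random data as stand-ins for illustration (not part of the repository):

import torch
import torch.nn as nn

torch.set_float32_matmul_precision('high')  # allow TF32 matmuls on Ampere+ GPUs
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = nn.Linear(64, 64).to(device)            # stand-in for the GPT model
opt = torch.optim.AdamW(model.parameters(), lr=3e-4)
x = torch.randn(8, 64, device=device)
y = torch.randn(8, 64, device=device)

for step in range(3):
    opt.zero_grad()
    # activations inside this block run in bfloat16; weights stay fp32
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        loss = nn.functional.mse_loss(model(x), y)
    loss.backward()   # gradients are accumulated in fp32, matching the parameters
    opt.step()
    print(step, loss.item())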
CodeFiles/train_get2-9-speedup4.py ADDED
@@ -0,0 +1,298 @@
1
+ # torch.compile
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ att = F.softmax(att, dim=-1)
41
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
44
+ # output projection
45
+ y = self.c_proj(y)
46
+ return y
47
+
48
+
49
+ class MLP(nn.Module):
50
+
51
+ def __init__(self, config):
52
+ super().__init__()
53
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
54
+ self.gelu = nn.GELU(approximate='tanh')
55
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
56
+ self.c_proj.NANOGPT_SCALE_INIT = 1
57
+
58
+ def forward(self, x):
59
+ x = self.c_fc(x)
60
+ x = self.gelu(x)
61
+ x = self.c_proj(x)
62
+ return x
63
+
64
+ class Block(nn.Module):
65
+
66
+ def __init__(self, config):
67
+ super().__init__()
68
+ self.ln_1 = nn.LayerNorm(config.n_embd)
69
+ self.attn = CausalSelfAttention(config)
70
+ self.ln_2 = nn.LayerNorm(config.n_embd)
71
+ self.mlp = MLP(config)
72
+
73
+ def forward(self, x):
74
+ x = x + self.attn(self.ln_1(x))
75
+ x = x + self.mlp(self.ln_2(x))
76
+ return x
77
+
78
+
79
+ @dataclass
80
+ class GPTConfig:
81
+ block_size: int = 1024 # max sequence length
82
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
83
+ n_layer: int = 12 # number of layers
84
+ n_head: int = 12 # number of heads
85
+ n_embd: int = 768 # embedding dimension
86
+
87
+
88
+ class GPT(nn.Module):
89
+
90
+ def __init__(self, config):
91
+ super().__init__()
92
+ self.config = config
93
+
94
+ self.transformer = nn.ModuleDict(dict(
95
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
96
+ wpe = nn.Embedding(config.block_size, config.n_embd),
97
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
98
+ ln_f = nn.LayerNorm(config.n_embd),
99
+ ))
100
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
101
+
102
+ # weight sharing
103
+ self.transformer.wte.weight = self.lm_head.weight
104
+
105
+ # weight initialization
106
+ self.apply(self._init_weights)
107
+
108
+ def _init_weights(self, module):
109
+ if isinstance(module, nn.Linear):
110
+ std = 0.02
111
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
112
+ std *= (2 * self.config.n_layer) ** -0.5
113
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
114
+ if module.bias is not None:
115
+ torch.nn.init.zeros_(module.bias)
116
+ elif isinstance(module, nn.Embedding):
117
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
118
+
119
+
120
+
121
+ def forward(self, idx, targets=None):
122
+ # idx is of shape (B, T)
123
+ B, T = idx.size()
124
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
125
+ # forward the token and position embeddings
126
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
127
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
128
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
129
+ x = tok_emb + pos_emb
130
+ # forward the blocks of the transformer
131
+ for block in self.transformer.h:
132
+ x = block(x)
133
+ # forward the final layernorm and the classifier
134
+ x = self.transformer.ln_f(x)
135
+ logits = self.lm_head(x) # (B, T, vocab_size)
136
+ loss = None
137
+ if targets is not None:
138
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
139
+ return logits, loss
140
+
141
+ @classmethod
142
+ def from_pretrained(cls, model_type):
143
+ """Loads pretrained GPT-2 model weights from huggingface"""
144
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
145
+ from transformers import GPT2LMHeadModel
146
+ print("loading weights from pretrained gpt: %s" % model_type)
147
+
148
+ # n_layer, n_head and n_embd are determined from model_type
149
+ config_args = {
150
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
151
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
152
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
153
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
154
+ }[model_type]
155
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
156
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
157
+ # create a from-scratch initialized minGPT model
158
+ config = GPTConfig(**config_args)
159
+ model = GPT(config)
160
+ sd = model.state_dict()
161
+ sd_keys = sd.keys()
162
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
163
+
164
+ # init a huggingface/transformers model
165
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
166
+ sd_hf = model_hf.state_dict()
167
+
168
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
169
+ sd_keys_hf = sd_hf.keys()
170
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
171
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
172
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
173
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
174
+ # this means that we have to transpose these weights when we import them
175
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
176
+ for k in sd_keys_hf:
177
+ if any(k.endswith(w) for w in transposed):
178
+ # special treatment for the Conv1D weights we need to transpose
179
+ assert sd_hf[k].shape[::-1] == sd[k].shape
180
+ with torch.no_grad():
181
+ sd[k].copy_(sd_hf[k].t())
182
+ else:
183
+ # vanilla copy over the other parameters
184
+ assert sd_hf[k].shape == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k])
187
+
188
+ return model
189
+
190
+ # model = GPT.from_pretrained('gpt2')
191
+
192
+ device = 'cpu'
193
+ if torch.cuda.is_available():
194
+ device = 'cuda'
195
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
196
+ device = "mps"
197
+ print(f"using device: {device}")
198
+
199
+ # SEED
200
+ torch.manual_seed(1337)
201
+ if torch.cuda.is_available():
202
+ torch.cuda.manual_seed(1337)
203
+
204
+ # STOP
205
+ num_return_sequences = 5
206
+ max_length = 30
207
+
208
+
209
+
210
+ import tiktoken
211
+
212
+ class DataLoaderLite:
213
+ def __init__(self, B, T):
214
+ self.B = B
215
+ self.T = T
216
+
217
+ # at init load tokens from disk and store them in memory
218
+ with open('input.txt', 'r') as f:
219
+ text = f.read()
220
+ enc = tiktoken.get_encoding('gpt2')
221
+ tokens = enc.encode(text)
222
+ self.tokens = torch.tensor(tokens)
223
+ print(f'loaded {len(self.tokens)} tokens')
224
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
225
+
226
+ # state
227
+ self.current_position = 0
228
+
229
+ def next_batch(self):
230
+ B, T = self.B, self.T
231
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
232
+ x = (buf[:-1]).view(B, T) # inputs
233
+ y = (buf[1:]).view(B, T) # targets
234
+ # advance the position in the tensor
235
+ self.current_position += B*T
236
+ # if loading the next batch would be out of bounds, reset
237
+ if self.current_position + (B * T + 1) > len(self.tokens):
238
+ self.current_position = 0
239
+ return x, y
240
+
241
+ # CHANGES IN CURRENT CODE
242
+ torch.set_float32_matmul_precision('high')
243
+
244
+ model = GPT(GPTConfig())
245
+ model.to(device)
246
+ model = torch.compile(model)
247
+
248
+ train_loader = DataLoaderLite(B = 8, T = 1024)
249
+
250
+ # NEW CODE
251
+ import time
252
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
253
+ for i in range(50):
254
+ t0 = time.time()
255
+ x, y = train_loader.next_batch()
256
+ x, y = x.to(device), y.to(device)
257
+ optimizer.zero_grad()
258
+ # NEW CODE ADDED HERE
259
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
260
+ logits, loss = model(x, y)
261
+ loss.backward()
262
+ optimizer.step()
263
+ torch.cuda.synchronize()
264
+ t1 = time.time()
265
+ dt = (t1 - t0) * 1000
266
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
267
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')
268
+
269
+
270
+ print(loss)
271
+ import sys; sys.exit(0)
272
+
273
+ torch.manual_seed(42)
274
+ torch.cuda.manual_seed(42)
275
+ while x.size(1) < max_length:
276
+ # forward the model to get the logits
277
+ with torch.no_grad():
278
+ logits = model(x)[0] # (B, T, vocab_size)
279
+ # take the logits at the last position
280
+ logits = logits[:, -1, :] # (B, vocab_size)
281
+ # get the probabilities
282
+ probs = F.softmax(logits, dim=-1)
283
+ # do top-k sampling of 50 (huggingface pipeline default)
284
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
285
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
286
+ # select a token from the top-k probabilities
287
+ # note: multinomial does not demand the input to sum to 1
288
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
289
+ # gather the corresponding indices
290
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
291
+ # append to the sequence
292
+ x = torch.cat((x, xcol), dim=1)
293
+
294
+ # print the generated text
295
+ for i in range(num_return_sequences):
296
+ tokens = x[i, :max_length].tolist()
297
+ decoded = enc.decode(tokens)
298
+ print(">", decoded)
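Relative to the previous file, the training setup above adds a single line, model = torch.compile(model), after moving the model to the device. A hedged, standalone sketch of just that step (requires PyTorch 2.x; the toy module is illustrative only):

import torch
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64)).to(device)

# torch.compile captures the model's graph and generates fused kernels;
# the first forward pass pays the compilation cost, later steps reuse it.
model = torch.compile(model)

x = torch.randn(8, 64, device=device)
print(model(x).shape)  # torch.Size([8, 64])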
CodeFiles/train_get2-9-speedup5.py ADDED
@@ -0,0 +1,300 @@
1
+ # Flash Attention
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ # att = F.softmax(att, dim=-1)
41
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
44
+
45
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
46
+ # output projection
47
+ y = self.c_proj(y)
48
+ return y
49
+
50
+
51
+ class MLP(nn.Module):
52
+
53
+ def __init__(self, config):
54
+ super().__init__()
55
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
56
+ self.gelu = nn.GELU(approximate='tanh')
57
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
58
+ self.c_proj.NANOGPT_SCALE_INIT = 1
59
+
60
+ def forward(self, x):
61
+ x = self.c_fc(x)
62
+ x = self.gelu(x)
63
+ x = self.c_proj(x)
64
+ return x
65
+
66
+ class Block(nn.Module):
67
+
68
+ def __init__(self, config):
69
+ super().__init__()
70
+ self.ln_1 = nn.LayerNorm(config.n_embd)
71
+ self.attn = CausalSelfAttention(config)
72
+ self.ln_2 = nn.LayerNorm(config.n_embd)
73
+ self.mlp = MLP(config)
74
+
75
+ def forward(self, x):
76
+ x = x + self.attn(self.ln_1(x))
77
+ x = x + self.mlp(self.ln_2(x))
78
+ return x
79
+
80
+
81
+ @dataclass
82
+ class GPTConfig:
83
+ block_size: int = 1024 # max sequence length
84
+ vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
85
+ n_layer: int = 12 # number of layers
86
+ n_head: int = 12 # number of heads
87
+ n_embd: int = 768 # embedding dimension
88
+
89
+
90
+ class GPT(nn.Module):
91
+
92
+ def __init__(self, config):
93
+ super().__init__()
94
+ self.config = config
95
+
96
+ self.transformer = nn.ModuleDict(dict(
97
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
98
+ wpe = nn.Embedding(config.block_size, config.n_embd),
99
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
100
+ ln_f = nn.LayerNorm(config.n_embd),
101
+ ))
102
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
103
+
104
+ # weight sharing
105
+ self.transformer.wte.weight = self.lm_head.weight
106
+
107
+ # weight initialization
108
+ self.apply(self._init_weights)
109
+
110
+ def _init_weights(self, module):
111
+ if isinstance(module, nn.Linear):
112
+ std = 0.02
113
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
114
+ std *= (2 * self.config.n_layer) ** -0.5
115
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
116
+ if module.bias is not None:
117
+ torch.nn.init.zeros_(module.bias)
118
+ elif isinstance(module, nn.Embedding):
119
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
120
+
121
+
122
+
123
+ def forward(self, idx, targets=None):
124
+ # idx is of shape (B, T)
125
+ B, T = idx.size()
126
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
127
+ # forward the token and position embeddings
128
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
129
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
130
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
131
+ x = tok_emb + pos_emb
132
+ # forward the blocks of the transformer
133
+ for block in self.transformer.h:
134
+ x = block(x)
135
+ # forward the final layernorm and the classifier
136
+ x = self.transformer.ln_f(x)
137
+ logits = self.lm_head(x) # (B, T, vocab_size)
138
+ loss = None
139
+ if targets is not None:
140
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
141
+ return logits, loss
142
+
143
+ @classmethod
144
+ def from_pretrained(cls, model_type):
145
+ """Loads pretrained GPT-2 model weights from huggingface"""
146
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
147
+ from transformers import GPT2LMHeadModel
148
+ print("loading weights from pretrained gpt: %s" % model_type)
149
+
150
+ # n_layer, n_head and n_embd are determined from model_type
151
+ config_args = {
152
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
153
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
154
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
155
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
156
+ }[model_type]
157
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
158
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
159
+ # create a from-scratch initialized minGPT model
160
+ config = GPTConfig(**config_args)
161
+ model = GPT(config)
162
+ sd = model.state_dict()
163
+ sd_keys = sd.keys()
164
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
165
+
166
+ # init a huggingface/transformers model
167
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
168
+ sd_hf = model_hf.state_dict()
169
+
170
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
171
+ sd_keys_hf = sd_hf.keys()
172
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
173
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
174
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
175
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
176
+ # this means that we have to transpose these weights when we import them
177
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
178
+ for k in sd_keys_hf:
179
+ if any(k.endswith(w) for w in transposed):
180
+ # special treatment for the Conv1D weights we need to transpose
181
+ assert sd_hf[k].shape[::-1] == sd[k].shape
182
+ with torch.no_grad():
183
+ sd[k].copy_(sd_hf[k].t())
184
+ else:
185
+ # vanilla copy over the other parameters
186
+ assert sd_hf[k].shape == sd[k].shape
187
+ with torch.no_grad():
188
+ sd[k].copy_(sd_hf[k])
189
+
190
+ return model
191
+
192
+ # model = GPT.from_pretrained('gpt2')
193
+
194
+ device = 'cpu'
195
+ if torch.cuda.is_available():
196
+ device = 'cuda'
197
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
198
+ device = "mps"
199
+ print(f"using device: {device}")
200
+
201
+ # SEED
202
+ torch.manual_seed(1337)
203
+ if torch.cuda.is_available():
204
+ torch.cuda.manual_seed(1337)
205
+
206
+ # STOP
207
+ num_return_sequences = 5
208
+ max_length = 30
209
+
210
+
211
+
212
+ import tiktoken
213
+
214
+ class DataLoaderLite:
215
+ def __init__(self, B, T):
216
+ self.B = B
217
+ self.T = T
218
+
219
+ # at init load tokens from disk and store them in memory
220
+ with open('input.txt', 'r') as f:
221
+ text = f.read()
222
+ enc = tiktoken.get_encoding('gpt2')
223
+ tokens = enc.encode(text)
224
+ self.tokens = torch.tensor(tokens)
225
+ print(f'loaded {len(self.tokens)} tokens')
226
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
227
+
228
+ # state
229
+ self.current_position = 0
230
+
231
+ def next_batch(self):
232
+ B, T = self.B, self.T
233
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
234
+ x = (buf[:-1]).view(B, T) # inputs
235
+ y = (buf[1:]).view(B, T) # targets
236
+ # advance the position in the tensor
237
+ self.current_position += B*T
238
+ # if loading the next batch would be out of bounds, reset
239
+ if self.current_position + (B * T + 1) > len(self.tokens):
240
+ self.current_position = 0
241
+ return x, y
242
+
243
+ # CHANGES IN CURRENT CODE
244
+ torch.set_float32_matmul_precision('high')
245
+
246
+ model = GPT(GPTConfig())
247
+ model.to(device)
248
+ # model = torch.compile(model)
249
+
250
+ train_loader = DataLoaderLite(B = 16, T = 1024)
251
+
252
+ # NEW CODE
253
+ import time
254
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
255
+ for i in range(50):
256
+ t0 = time.time()
257
+ x, y = train_loader.next_batch()
258
+ x, y = x.to(device), y.to(device)
259
+ optimizer.zero_grad()
260
+ # NEW CODE ADDED HERE
261
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
262
+ logits, loss = model(x, y)
263
+ loss.backward()
264
+ optimizer.step()
265
+ torch.cuda.synchronize()
266
+ t1 = time.time()
267
+ dt = (t1 - t0) * 1000
268
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
269
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')
270
+
271
+
272
+ print(loss)
273
+ import sys; sys.exit(0)
274
+
275
+ torch.manual_seed(42)
276
+ torch.cuda.manual_seed(42)
277
+ while x.size(1) < max_length:
278
+ # forward the model to get the logits
279
+ with torch.no_grad():
280
+ logits = model(x)[0] # (B, T, vocab_size)
281
+ # take the logits at the last position
282
+ logits = logits[:, -1, :] # (B, vocab_size)
283
+ # get the probabilities
284
+ probs = F.softmax(logits, dim=-1)
285
+ # do top-k sampling of 50 (huggingface pipeline default)
286
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
287
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
288
+ # select a token from the top-k probabilities
289
+ # note: multinomial does not demand the input to sum to 1
290
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
291
+ # gather the corresponding indices
292
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
293
+ # append to the sequence
294
+ x = torch.cat((x, xcol), dim=1)
295
+
296
+ # print the generated text
297
+ for i in range(num_return_sequences):
298
+ tokens = x[i, :max_length].tolist()
299
+ decoded = enc.decode(tokens)
300
+ print(">", decoded)
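The attention change in the file above replaces the manual masked-softmax attention (left commented out) with F.scaled_dot_product_attention(..., is_causal=True), which can dispatch to a fused FlashAttention-style kernel. A small numerical check that the two paths agree, using the same (B, nh, T, hs) layout; the shapes below are arbitrary illustration values:

import math
import torch
import torch.nn.functional as F

B, nh, T, hs = 2, 4, 16, 32
q = torch.randn(B, nh, T, hs)
k = torch.randn(B, nh, T, hs)
v = torch.randn(B, nh, T, hs)

# manual causal attention (the commented-out path in the file)
att = (q @ k.transpose(-2, -1)) / math.sqrt(hs)
mask = torch.tril(torch.ones(T, T, dtype=torch.bool))
att = att.masked_fill(~mask, float('-inf'))
y_manual = F.softmax(att, dim=-1) @ v

# fused path used in the file
y_fused = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(torch.allclose(y_manual, y_fused, atol=1e-5))  # expect True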
CodeFiles/train_get2-9-speedup6.py ADDED
@@ -0,0 +1,300 @@
1
+ # Power of 2: pad vocab_size up to 50304 (a multiple of 128)
2
+ import os
3
+ import math
4
+ import time
5
+ import inspect
6
+ from dataclasses import dataclass
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+
11
+
12
+ class CausalSelfAttention(nn.Module):
13
+
14
+ def __init__(self, config):
15
+ super().__init__()
16
+ assert config.n_embd % config.n_head == 0
17
+ # key, query, value projections for all heads, but in a batch
18
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
19
+ # output projection
20
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
21
+ self.c_proj.NANOGPT_SCALE_INIT = 1
22
+ # regularization
23
+ self.n_head = config.n_head
24
+ self.n_embd = config.n_embd
25
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
26
+
27
+ def forward(self, x):
28
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
29
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
30
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
31
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
32
+ qkv = self.c_attn(x)
33
+ q, k, v = qkv.split(self.n_embd, dim=2)
34
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
35
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+
38
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
39
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
40
+ # att = F.softmax(att, dim=-1)
41
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
42
+
43
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
44
+
45
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
46
+ # output projection
47
+ y = self.c_proj(y)
48
+ return y
49
+
50
+
51
+ class MLP(nn.Module):
52
+
53
+ def __init__(self, config):
54
+ super().__init__()
55
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
56
+ self.gelu = nn.GELU(approximate='tanh')
57
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
58
+ self.c_proj.NANOGPT_SCALE_INIT = 1
59
+
60
+ def forward(self, x):
61
+ x = self.c_fc(x)
62
+ x = self.gelu(x)
63
+ x = self.c_proj(x)
64
+ return x
65
+
66
+ class Block(nn.Module):
67
+
68
+ def __init__(self, config):
69
+ super().__init__()
70
+ self.ln_1 = nn.LayerNorm(config.n_embd)
71
+ self.attn = CausalSelfAttention(config)
72
+ self.ln_2 = nn.LayerNorm(config.n_embd)
73
+ self.mlp = MLP(config)
74
+
75
+ def forward(self, x):
76
+ x = x + self.attn(self.ln_1(x))
77
+ x = x + self.mlp(self.ln_2(x))
78
+ return x
79
+
80
+
81
+ @dataclass
82
+ class GPTConfig:
83
+ block_size: int = 1024 # max sequence length
84
+ vocab_size: int = 50304 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
85
+ n_layer: int = 12 # number of layers
86
+ n_head: int = 12 # number of heads
87
+ n_embd: int = 768 # embedding dimension
88
+
89
+
90
+ class GPT(nn.Module):
91
+
92
+ def __init__(self, config):
93
+ super().__init__()
94
+ self.config = config
95
+
96
+ self.transformer = nn.ModuleDict(dict(
97
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
98
+ wpe = nn.Embedding(config.block_size, config.n_embd),
99
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
100
+ ln_f = nn.LayerNorm(config.n_embd),
101
+ ))
102
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
103
+
104
+ # weight sharing
105
+ self.transformer.wte.weight = self.lm_head.weight
106
+
107
+ # weight initialization
108
+ self.apply(self._init_weights)
109
+
110
+ def _init_weights(self, module):
111
+ if isinstance(module, nn.Linear):
112
+ std = 0.02
113
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
114
+ std *= (2 * self.config.n_layer) ** -0.5
115
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
116
+ if module.bias is not None:
117
+ torch.nn.init.zeros_(module.bias)
118
+ elif isinstance(module, nn.Embedding):
119
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
120
+
121
+
122
+
123
+ def forward(self, idx, targets=None):
124
+ # idx is of shape (B, T)
125
+ B, T = idx.size()
126
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
127
+ # forward the token and position embeddings
128
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
129
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
130
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
131
+ x = tok_emb + pos_emb
132
+ # forward the blocks of the transformer
133
+ for block in self.transformer.h:
134
+ x = block(x)
135
+ # forward the final layernorm and the classifier
136
+ x = self.transformer.ln_f(x)
137
+ logits = self.lm_head(x) # (B, T, vocab_size)
138
+ loss = None
139
+ if targets is not None:
140
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
141
+ return logits, loss
142
+
143
+ @classmethod
144
+ def from_pretrained(cls, model_type):
145
+ """Loads pretrained GPT-2 model weights from huggingface"""
146
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
147
+ from transformers import GPT2LMHeadModel
148
+ print("loading weights from pretrained gpt: %s" % model_type)
149
+
150
+ # n_layer, n_head and n_embd are determined from model_type
151
+ config_args = {
152
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
153
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
154
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
155
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
156
+ }[model_type]
157
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
158
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
159
+ # create a from-scratch initialized minGPT model
160
+ config = GPTConfig(**config_args)
161
+ model = GPT(config)
162
+ sd = model.state_dict()
163
+ sd_keys = sd.keys()
164
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
165
+
166
+ # init a huggingface/transformers model
167
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
168
+ sd_hf = model_hf.state_dict()
169
+
170
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
171
+ sd_keys_hf = sd_hf.keys()
172
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
173
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
174
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
175
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
176
+ # this means that we have to transpose these weights when we import them
177
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
178
+ for k in sd_keys_hf:
179
+ if any(k.endswith(w) for w in transposed):
180
+ # special treatment for the Conv1D weights we need to transpose
181
+ assert sd_hf[k].shape[::-1] == sd[k].shape
182
+ with torch.no_grad():
183
+ sd[k].copy_(sd_hf[k].t())
184
+ else:
185
+ # vanilla copy over the other parameters
186
+ assert sd_hf[k].shape == sd[k].shape
187
+ with torch.no_grad():
188
+ sd[k].copy_(sd_hf[k])
189
+
190
+ return model
191
+
192
+ # model = GPT.from_pretrained('gpt2')
193
+
194
+ device = 'cpu'
195
+ if torch.cuda.is_available():
196
+ device = 'cuda'
197
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
198
+ device = "mps"
199
+ print(f"using device: {device}")
200
+
201
+ # SEED
202
+ torch.manual_seed(1337)
203
+ if torch.cuda.is_available():
204
+ torch.cuda.manual_seed(1337)
205
+
206
+ # STOP
207
+ num_return_sequences = 5
208
+ max_length = 30
209
+
210
+
211
+
212
+ import tiktoken
213
+
214
+ class DataLoaderLite:
215
+ def __init__(self, B, T):
216
+ self.B = B
217
+ self.T = T
218
+
219
+ # at init load tokens from disk and store them in memory
220
+ with open('input.txt', 'r') as f:
221
+ text = f.read()
222
+ enc = tiktoken.get_encoding('gpt2')
223
+ tokens = enc.encode(text)
224
+ self.tokens = torch.tensor(tokens)
225
+ print(f'loaded {len(self.tokens)} tokens')
226
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
227
+
228
+ # state
229
+ self.current_position = 0
230
+
231
+ def next_batch(self):
232
+ B, T = self.B, self.T
233
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
234
+ x = (buf[:-1]).view(B, T) # inputs
235
+ y = (buf[1:]).view(B, T) # targets
236
+ # advance the position in the tensor
237
+ self.current_position += B*T
238
+ # if loading the next batch would be out of bounds, reset
239
+ if self.current_position + (B * T + 1) > len(self.tokens):
240
+ self.current_position = 0
241
+ return x, y
242
+
243
+ # CHANGES IN CURRENT CODE
244
+ torch.set_float32_matmul_precision('high')
245
+
246
+ model = GPT(GPTConfig())
247
+ model.to(device)
248
+ # model = torch.compile(model)
249
+
250
+ train_loader = DataLoaderLite(B = 16, T = 1024)
251
+
252
+ # NEW CODE
253
+ import time
254
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4)
255
+ for i in range(50):
256
+ t0 = time.time()
257
+ x, y = train_loader.next_batch()
258
+ x, y = x.to(device), y.to(device)
259
+ optimizer.zero_grad()
260
+ # NEW CODE ADDED HERE
261
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
262
+ logits, loss = model(x, y)
263
+ loss.backward()
264
+ optimizer.step()
265
+ torch.cuda.synchronize()
266
+ t1 = time.time()
267
+ dt = (t1 - t0) * 1000
268
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
269
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f}')
270
+
271
+
272
+ print(loss)
273
+ import sys; sys.exit(0)
274
+
275
+ torch.manual_seed(42)
276
+ torch.cuda.manual_seed(42)
277
+ while x.size(1) < max_length:
278
+ # forward the model to get the logits
279
+ with torch.no_grad():
280
+ logits = model(x)[0] # (B, T, vocab_size)
281
+ # take the logits at the last position
282
+ logits = logits[:, -1, :] # (B, vocab_size)
283
+ # get the probabilities
284
+ probs = F.softmax(logits, dim=-1)
285
+ # do top-k sampling of 50 (huggingface pipeline default)
286
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
287
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
288
+ # select a token from the top-k probabilities
289
+ # note: multinomial does not demand the input to sum to 1
290
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
291
+ # gather the corresponding indices
292
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
293
+ # append to the sequence
294
+ x = torch.cat((x, xcol), dim=1)
295
+
296
+ # print the generated text
297
+ for i in range(num_return_sequences):
298
+ tokens = x[i, :max_length].tolist()
299
+ decoded = enc.decode(tokens)
300
+ print(">", decoded)
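The only functional change in the file above is GPTConfig.vocab_size: 50257 is padded up to 50304, which is divisible by 128, so the token-embedding and lm_head matrices get dimensions that map more evenly onto GPU tiles; token ids above 50256 never occur in the data, so the extra rows are effectively unused. A small helper that reproduces the rounding (the function name is ours, for illustration only):

def pad_vocab_size(vocab_size: int, multiple: int = 128) -> int:
    """Round vocab_size up to the next multiple of `multiple` (50257 -> 50304)."""
    return ((vocab_size + multiple - 1) // multiple) * multiple

assert pad_vocab_size(50257) == 50304
assert pad_vocab_size(50304) == 50304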
CodeFiles/train_get2-9-speedup7.py ADDED
@@ -0,0 +1,304 @@
1
+ # GPT-3 Paper
2
+ # model training, hyper-parameters
3
+ # AdamW
4
+ # gradient clipping.
5
+ import os
6
+ import math
7
+ import time
8
+ import inspect
9
+ from dataclasses import dataclass
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.nn import functional as F
13
+
14
+
15
+ class CausalSelfAttention(nn.Module):
16
+
17
+ def __init__(self, config):
18
+ super().__init__()
19
+ assert config.n_embd % config.n_head == 0
20
+ # key, query, value projections for all heads, but in a batch
21
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
22
+ # output projection
23
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
24
+ self.c_proj.NANOGPT_SCALE_INIT = 1
25
+ # regularization
26
+ self.n_head = config.n_head
27
+ self.n_embd = config.n_embd
28
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
29
+
30
+ def forward(self, x):
31
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
32
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
33
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
34
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
35
+ qkv = self.c_attn(x)
36
+ q, k, v = qkv.split(self.n_embd, dim=2)
37
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
38
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
39
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
40
+
41
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
42
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
43
+ # att = F.softmax(att, dim=-1)
44
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
45
+
46
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
47
+
48
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
49
+ # output projection
50
+ y = self.c_proj(y)
51
+ return y
52
+
53
+
54
+ class MLP(nn.Module):
55
+
56
+ def __init__(self, config):
57
+ super().__init__()
58
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
59
+ self.gelu = nn.GELU(approximate='tanh')
60
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
61
+ self.c_proj.NANOGPT_SCALE_INIT = 1
62
+
63
+ def forward(self, x):
64
+ x = self.c_fc(x)
65
+ x = self.gelu(x)
66
+ x = self.c_proj(x)
67
+ return x
68
+
69
+ class Block(nn.Module):
70
+
71
+ def __init__(self, config):
72
+ super().__init__()
73
+ self.ln_1 = nn.LayerNorm(config.n_embd)
74
+ self.attn = CausalSelfAttention(config)
75
+ self.ln_2 = nn.LayerNorm(config.n_embd)
76
+ self.mlp = MLP(config)
77
+
78
+ def forward(self, x):
79
+ x = x + self.attn(self.ln_1(x))
80
+ x = x + self.mlp(self.ln_2(x))
81
+ return x
82
+
83
+
84
+ @dataclass
85
+ class GPTConfig:
86
+ block_size: int = 1024 # max sequence length
87
+ vocab_size: int = 50304 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
88
+ n_layer: int = 12 # number of layers
89
+ n_head: int = 12 # number of heads
90
+ n_embd: int = 768 # embedding dimension
91
+
92
+
93
+ class GPT(nn.Module):
94
+
95
+ def __init__(self, config):
96
+ super().__init__()
97
+ self.config = config
98
+
99
+ self.transformer = nn.ModuleDict(dict(
100
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
101
+ wpe = nn.Embedding(config.block_size, config.n_embd),
102
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
103
+ ln_f = nn.LayerNorm(config.n_embd),
104
+ ))
105
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
106
+
107
+ # weight sharing
108
+ self.transformer.wte.weight = self.lm_head.weight
109
+
110
+ # weight initialization
111
+ self.apply(self._init_weights)
112
+
113
+ def _init_weights(self, module):
114
+ if isinstance(module, nn.Linear):
115
+ std = 0.02
116
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
117
+ std *= (2 * self.config.n_layer) ** -0.5
118
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
119
+ if module.bias is not None:
120
+ torch.nn.init.zeros_(module.bias)
121
+ elif isinstance(module, nn.Embedding):
122
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
123
+
124
+
125
+
126
+ def forward(self, idx, targets=None):
127
+ # idx is of shape (B, T)
128
+ B, T = idx.size()
129
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
130
+ # forward the token and position embeddings
131
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
132
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
133
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
134
+ x = tok_emb + pos_emb
135
+ # forward the blocks of the transformer
136
+ for block in self.transformer.h:
137
+ x = block(x)
138
+ # forward the final layernorm and the classifier
139
+ x = self.transformer.ln_f(x)
140
+ logits = self.lm_head(x) # (B, T, vocab_size)
141
+ loss = None
142
+ if targets is not None:
143
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
144
+ return logits, loss
145
+
146
+ @classmethod
147
+ def from_pretrained(cls, model_type):
148
+ """Loads pretrained GPT-2 model weights from huggingface"""
149
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
150
+ from transformers import GPT2LMHeadModel
151
+ print("loading weights from pretrained gpt: %s" % model_type)
152
+
153
+ # n_layer, n_head and n_embd are determined from model_type
154
+ config_args = {
155
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
156
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
157
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
158
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
159
+ }[model_type]
160
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
161
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
162
+ # create a from-scratch initialized minGPT model
163
+ config = GPTConfig(**config_args)
164
+ model = GPT(config)
165
+ sd = model.state_dict()
166
+ sd_keys = sd.keys()
167
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
168
+
169
+ # init a huggingface/transformers model
170
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
171
+ sd_hf = model_hf.state_dict()
172
+
173
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
174
+ sd_keys_hf = sd_hf.keys()
175
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
176
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
177
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
178
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
179
+ # this means that we have to transpose these weights when we import them
180
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
181
+ for k in sd_keys_hf:
182
+ if any(k.endswith(w) for w in transposed):
183
+ # special treatment for the Conv1D weights we need to transpose
184
+ assert sd_hf[k].shape[::-1] == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k].t())
187
+ else:
188
+ # vanilla copy over the other parameters
189
+ assert sd_hf[k].shape == sd[k].shape
190
+ with torch.no_grad():
191
+ sd[k].copy_(sd_hf[k])
192
+
193
+ return model
194
+
195
+ # model = GPT.from_pretrained('gpt2')
196
+
197
+ device = 'cpu'
198
+ if torch.cuda.is_available():
199
+ device = 'cuda'
200
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
201
+ device = "mps"
202
+ print(f"using device: {device}")
203
+
204
+ # SEED
205
+ torch.manual_seed(1337)
206
+ if torch.cuda.is_available():
207
+ torch.cuda.manual_seed(1337)
208
+
209
+ # STOP
210
+ num_return_sequences = 5
211
+ max_length = 30
212
+
213
+
214
+
215
+ import tiktoken
216
+
217
+ class DataLoaderLite:
218
+ def __init__(self, B, T):
219
+ self.B = B
220
+ self.T = T
221
+
222
+ # at init load tokens from disk and store them in memory
223
+ with open('input.txt', 'r') as f:
224
+ text = f.read()
225
+ enc = tiktoken.get_encoding('gpt2')
226
+ tokens = enc.encode(text)
227
+ self.tokens = torch.tensor(tokens)
228
+ print(f'loaded {len(self.tokens)} tokens')
229
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
230
+
231
+ # state
232
+ self.current_position = 0
233
+
234
+ def next_batch(self):
235
+ B, T = self.B, self.T
236
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
237
+ x = (buf[:-1]).view(B, T) # inputs
238
+ y = (buf[1:]).view(B, T) # targets
239
+ # advance the position in the tensor
240
+ self.current_position += B*T
241
+ # if loading the next batch would be out of bounds, reset
242
+ if self.current_position + (B * T + 1) > len(self.tokens):
243
+ self.current_position = 0
244
+ return x, y
245
+
246
+ # CHANGES IN CURRENT CODE
247
+ torch.set_float32_matmul_precision('high')
248
+
249
+ model = GPT(GPTConfig())
250
+ model.to(device)
251
+ # model = torch.compile(model)
252
+
253
+ train_loader = DataLoaderLite(B = 16, T = 1024)
254
+
255
+ # NEW CODE
256
+ import time
257
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4, betas=(0.9, 0.95), eps=1e-8)
258
+ for i in range(50):
259
+ t0 = time.time()
260
+ x, y = train_loader.next_batch()
261
+ x, y = x.to(device), y.to(device)
262
+ optimizer.zero_grad()
263
+ # NEW CODE ADDED HERE
264
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
265
+ logits, loss = model(x, y)
266
+ loss.backward()
267
+ norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # in-place API; returns the pre-clip global norm
268
+ optimizer.step()
269
+ torch.cuda.synchronize()
270
+ t1 = time.time()
271
+ dt = (t1 - t0) * 1000
272
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
273
+ print(f'step{i} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec: .2f} | norm: {norm:.2f}')
274
+
275
+
276
+ print(loss)
277
+ import sys; sys.exit(0)
278
+
279
+ torch.manual_seed(42)
280
+ torch.cuda.manual_seed(42)
281
+ while x.size(1) < max_length:
282
+ # forward the model to get the logits
283
+ with torch.no_grad():
284
+ logits = model(x)[0] # (B, T, vocab_size)
285
+ # take the logits at the last position
286
+ logits = logits[:, -1, :] # (B, vocab_size)
287
+ # get the probabilities
288
+ probs = F.softmax(logits, dim=-1)
289
+ # do top-k sampling of 50 (huggingface pipeline default)
290
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
291
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
292
+ # select a token from the top-k probabilities
293
+ # note: multinomial does not demand the input to sum to 1
294
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
295
+ # gather the corresponding indices
296
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
297
+ # append to the sequence
298
+ x = torch.cat((x, xcol), dim=1)
299
+
300
+ # print the generated text
301
+ for i in range(num_return_sequences):
302
+ tokens = x[i, :max_length].tolist()
303
+ decoded = enc.decode(tokens)
304
+ print(">", decoded)
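The optimizer-side changes above follow the GPT-3 paper settings named in the file header: AdamW with betas=(0.9, 0.95) and eps=1e-8, plus clipping the global gradient norm to 1.0 before every optimizer step. A minimal sketch of that step order (the toy model and data are placeholders; note the in-place clip_grad_norm_, which also returns the pre-clip norm for logging):

import torch
import torch.nn as nn

model = nn.Linear(32, 32)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-8)

x, y = torch.randn(16, 32), torch.randn(16, 32)
for step in range(3):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()
    # clip after backward, before step; returns the total norm seen before clipping
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    print(f'step {step} | loss {loss.item():.4f} | norm {norm:.2f}')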
CodeFiles/train_get2-9-speedup8.py ADDED
@@ -0,0 +1,322 @@
1
+ # GPT-3 Paper
2
+ # add cosine decay (learning-rate schedule)
3
+ import os
4
+ import math
5
+ import time
6
+ import inspect
7
+ from dataclasses import dataclass
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.nn import functional as F
11
+
12
+
13
+ class CausalSelfAttention(nn.Module):
14
+
15
+ def __init__(self, config):
16
+ super().__init__()
17
+ assert config.n_embd % config.n_head == 0
18
+ # key, query, value projections for all heads, but in a batch
19
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
20
+ # output projection
21
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
22
+ self.c_proj.NANOGPT_SCALE_INIT = 1
23
+ # regularization
24
+ self.n_head = config.n_head
25
+ self.n_embd = config.n_embd
26
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
27
+
28
+ def forward(self, x):
29
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
30
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
31
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
32
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
33
+ qkv = self.c_attn(x)
34
+ q, k, v = qkv.split(self.n_embd, dim=2)
35
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
38
+
39
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
40
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
41
+ # att = F.softmax(att, dim=-1)
42
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
43
+
44
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
45
+
46
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
47
+ # output projection
48
+ y = self.c_proj(y)
49
+ return y
50
+
51
+
52
+ class MLP(nn.Module):
53
+
54
+ def __init__(self, config):
55
+ super().__init__()
56
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
57
+ self.gelu = nn.GELU(approximate='tanh')
58
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
59
+ self.c_proj.NANOGPT_SCALE_INIT = 1
60
+
61
+ def forward(self, x):
62
+ x = self.c_fc(x)
63
+ x = self.gelu(x)
64
+ x = self.c_proj(x)
65
+ return x
66
+
67
+ class Block(nn.Module):
68
+
69
+ def __init__(self, config):
70
+ super().__init__()
71
+ self.ln_1 = nn.LayerNorm(config.n_embd)
72
+ self.attn = CausalSelfAttention(config)
73
+ self.ln_2 = nn.LayerNorm(config.n_embd)
74
+ self.mlp = MLP(config)
75
+
76
+ def forward(self, x):
77
+ x = x + self.attn(self.ln_1(x))
78
+ x = x + self.mlp(self.ln_2(x))
79
+ return x
80
+
81
+
82
+ @dataclass
83
+ class GPTConfig:
84
+ block_size: int = 1024 # max sequence length
85
+ vocab_size: int = 50304 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
86
+ n_layer: int = 12 # number of layers
87
+ n_head: int = 12 # number of heads
88
+ n_embd: int = 768 # embedding dimension
89
+
90
+
91
+ class GPT(nn.Module):
92
+
93
+ def __init__(self, config):
94
+ super().__init__()
95
+ self.config = config
96
+
97
+ self.transformer = nn.ModuleDict(dict(
98
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
99
+ wpe = nn.Embedding(config.block_size, config.n_embd),
100
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
101
+ ln_f = nn.LayerNorm(config.n_embd),
102
+ ))
103
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
104
+
105
+ # weight sharing
106
+ self.transformer.wte.weight = self.lm_head.weight
107
+
108
+ # weight initialization
109
+ self.apply(self._init_weights)
110
+
111
+ def _init_weights(self, module):
112
+ if isinstance(module, nn.Linear):
113
+ std = 0.02
114
+ if hasattr(module, 'NANOGPT_SCALE_INIT'):
115
+ std *= (2 * self.config.n_layer) ** -0.5
116
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
117
+ if module.bias is not None:
118
+ torch.nn.init.zeros_(module.bias)
119
+ elif isinstance(module, nn.Embedding):
120
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
121
+
122
+
123
+
124
+ def forward(self, idx, targets=None):
125
+ # idx is of shape (B, T)
126
+ B, T = idx.size()
127
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
128
+ # forward the token and position embeddings
129
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
130
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
131
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
132
+ x = tok_emb + pos_emb
133
+ # forward the blocks of the transformer
134
+ for block in self.transformer.h:
135
+ x = block(x)
136
+ # forward the final layernorm and the classifier
137
+ x = self.transformer.ln_f(x)
138
+ logits = self.lm_head(x) # (B, T, vocab_size)
139
+ loss = None
140
+ if targets is not None:
141
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
142
+ return logits, loss
143
+
144
+ @classmethod
145
+ def from_pretrained(cls, model_type):
146
+ """Loads pretrained GPT-2 model weights from huggingface"""
147
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
148
+ from transformers import GPT2LMHeadModel
149
+ print("loading weights from pretrained gpt: %s" % model_type)
150
+
151
+ # n_layer, n_head and n_embd are determined from model_type
152
+ config_args = {
153
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
154
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
155
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
156
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
157
+ }[model_type]
158
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
159
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
160
+ # create a from-scratch initialized minGPT model
161
+ config = GPTConfig(**config_args)
162
+ model = GPT(config)
163
+ sd = model.state_dict()
164
+ sd_keys = sd.keys()
165
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
166
+
167
+ # init a huggingface/transformers model
168
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
169
+ sd_hf = model_hf.state_dict()
170
+
171
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
172
+ sd_keys_hf = sd_hf.keys()
173
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
174
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
175
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
176
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
177
+ # this means that we have to transpose these weights when we import them
178
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
179
+ for k in sd_keys_hf:
180
+ if any(k.endswith(w) for w in transposed):
181
+ # special treatment for the Conv1D weights we need to transpose
182
+ assert sd_hf[k].shape[::-1] == sd[k].shape
183
+ with torch.no_grad():
184
+ sd[k].copy_(sd_hf[k].t())
185
+ else:
186
+ # vanilla copy over the other parameters
187
+ assert sd_hf[k].shape == sd[k].shape
188
+ with torch.no_grad():
189
+ sd[k].copy_(sd_hf[k])
190
+
191
+ return model
192
+
193
+ # model = GPT.from_pretrained('gpt2')
194
+
195
+ device = 'cpu'
196
+ if torch.cuda.is_available():
197
+ device = 'cuda'
198
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
199
+ device = "mps"
200
+ print(f"using device: {device}")
201
+
202
+ # SEED
203
+ torch.manual_seed(1337)
204
+ if torch.cuda.is_available():
205
+ torch.cuda.manual_seed(1337)
206
+
207
+ # STOP
208
+ num_return_sequences = 5
209
+ max_length = 30
210
+
211
+
212
+
213
+ import tiktoken
214
+
215
+ class DataLoaderLite:
216
+ def __init__(self, B, T):
217
+ self.B = B
218
+ self.T = T
219
+
220
+ # at init load tokens from disk and store them in memory
221
+ with open('input.txt', 'r') as f:
222
+ text = f.read()
223
+ enc = tiktoken.get_encoding('gpt2')
224
+ tokens = enc.encode(text)
225
+ self.tokens = torch.tensor(tokens)
226
+ print(f'loaded {len(self.tokens)} tokens')
227
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
228
+
229
+ # state
230
+ self.current_position = 0
231
+
232
+ def next_batch(self):
233
+ B, T = self.B, self.T
234
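+ # grab B*T + 1 tokens so inputs and targets can be offset from each other by one position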
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
235
+ x = (buf[:-1]).view(B, T) # inputs
236
+ y = (buf[1:]).view(B, T) # targets
237
+ # advance the position in the tensor
238
+ self.current_position += B*T
239
+ # if loading the next batch would be out of bounds, reset
240
+ if self.current_position + (B * T + 1) > len(self.tokens):
241
+ self.current_position = 0
242
+ return x, y
243
+
244
+ # CHANGES IN CURRENT CODE
245
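+ # 'high' lets float32 matmuls run in TF32 on Ampere-and-newer GPUs: much higher throughput at slightly reduced precision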
+ torch.set_float32_matmul_precision('high')
246
+ model = GPT(GPTConfig())
247
+ model.to(device)
248
+ # model = torch.compile(model)
249
+
250
+ # CODE UPDATE HERE
251
+ max_lr = 6e-4
252
+ min_lr = max_lr * 0.1
253
+ warmup_steps = 10
254
+ max_steps = 50
255
+
256
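+ # learning-rate schedule (GPT-3 style): linear warmup for warmup_steps, then cosine decay from max_lr to min_lr
+ # e.g. with max_lr = 6e-4: step 0 -> 6e-5, step 9 -> 6e-4, then cosine decay towards 6e-5 by max_steps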
+ def get_lr(it):
257
+ if it < warmup_steps:
258
+ return max_lr * (it + 1) / warmup_steps
259
+ if it > max_steps:
260
+ return min_lr
261
+ decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
262
+ assert 0 <= decay_ratio <=1
263
+ coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
264
+ return min_lr + coeff * (max_lr - min_lr)
265
+
266
+ train_loader = DataLoaderLite(B = 16, T = 1024)
267
+
268
+ # NEW CODE
269
+ import time
270
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4, betas=(0.9, 0.95), eps=1e-8)
271
+ for step in range(50):
272
+ t0 = time.time()
273
+ x, y = train_loader.next_batch()
274
+ x, y = x.to(device), y.to(device)
275
+ optimizer.zero_grad()
276
+ # NEW CODE ADDED HERE
277
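+ # bfloat16 autocast keeps float32's exponent range, so no GradScaler is needed (unlike float16 mixed precision)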
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
278
+ logits, loss = model(x, y)
279
+ loss.backward()
280
+ norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip_grad_norm_ is the in-place, non-deprecated API; returns the pre-clip global norm
281
+ # NEW CODE
282
+ lr = get_lr(step)
283
+ for param_group in optimizer.param_groups:
284
+ param_group['lr'] = lr
285
+
286
+ optimizer.step()
287
+ torch.cuda.synchronize()
288
+ t1 = time.time()
289
+ dt = (t1 - t0) * 1000
290
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
291
+ print(f'step {step} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f} | norm: {norm:.2f}')
292
+
293
+
294
+ print(loss)
295
+ import sys; sys.exit(0)
296
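+ # NOTE: the code below never runs because of the sys.exit above; it is leftover sampling code (see infer.py for a working version)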
+
297
+ torch.manual_seed(42)
298
+ torch.cuda.manual_seed(42)
299
+ while x.size(1) < max_length:
300
+ # forward the model to get the logits
301
+ with torch.no_grad():
302
+ logits = model(x)[0] # (B, T, vocab_size)
303
+ # take the logits at the last position
304
+ logits = logits[:, -1, :] # (B, vocab_size)
305
+ # get the probabilities
306
+ probs = F.softmax(logits, dim=-1)
307
+ # do top-k sampling of 50 (huggingface pipeline default)
308
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
309
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
310
+ # select a token from the top-k probabilities
311
+ # note: multinomial does not demand the input to sum to 1
312
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
313
+ # gather the corresponding indices
314
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
315
+ # append to the sequence
316
+ x = torch.cat((x, xcol), dim=1)
317
+
318
+ # print the generated text
319
+ for i in range(num_return_sequences):
320
+ tokens = x[i, :max_length].tolist()
321
+ decoded = enc.decode(tokens)
322
+ print(">", decoded)
CodeFiles/train_get2-9-speedup9.py ADDED
@@ -0,0 +1,352 @@
1
+ # GPT-3 Paper
2
+ # add cosine decay
3
+ import os
4
+ import math
5
+ import time
6
+ import inspect
7
+ from dataclasses import dataclass
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.nn import functional as F
11
+
12
+
13
+ class CausalSelfAttention(nn.Module):
14
+
15
+ def __init__(self, config):
16
+ super().__init__()
17
+ assert config.n_embd % config.n_head == 0
18
+ # key, query, value projections for all heads, but in a batch
19
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
20
+ # output projection
21
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
22
+ self.c_proj.NANOGPT_SCALE_INIT = 1 # spelling unified with _init_weights and MLP
23
+ # regularization
24
+ self.n_head = config.n_head
25
+ self.n_embd = config.n_embd
26
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
27
+
28
+ def forward(self, x):
29
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
30
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
31
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
32
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
33
+ qkv = self.c_attn(x)
34
+ q, k, v = qkv.split(self.n_embd, dim=2)
35
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
36
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
37
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
38
+
39
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
40
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
41
+ # att = F.softmax(att, dim=-1)
42
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
43
+
44
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
45
+
46
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
47
+ # output projection
48
+ y = self.c_proj(y)
49
+ return y
50
+
51
+
52
+ class MLP(nn.Module):
53
+
54
+ def __init__(self, config):
55
+ super().__init__()
56
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
57
+ self.gelu = nn.GELU(approximate='tanh')
58
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
59
+ self.c_proj.NANOGPT_SCALE_INIT = 1
60
+
61
+ def forward(self, x):
62
+ x = self.c_fc(x)
63
+ x = self.gelu(x)
64
+ x = self.c_proj(x)
65
+ return x
66
+
67
+ class Block(nn.Module):
68
+
69
+ def __init__(self, config):
70
+ super().__init__()
71
+ self.ln_1 = nn.LayerNorm(config.n_embd)
72
+ self.attn = CausalSelfAttention(config)
73
+ self.ln_2 = nn.LayerNorm(config.n_embd)
74
+ self.mlp = MLP(config)
75
+
76
+ def forward(self, x):
77
+ x = x + self.attn(self.ln_1(x))
78
+ x = x + self.mlp(self.ln_2(x))
79
+ return x
80
+
81
+
82
+ @dataclass
83
+ class GPTConfig:
84
+ block_size: int = 1024 # max sequence length
85
+ vocab_size: int = 50304 # GPT-2's vocab is 50,257 (50,000 BPE merges + 256 byte tokens + 1 <|endoftext|>), padded up to 50,304 (a multiple of 128) for better GPU utilization
86
+ n_layer: int = 12 # number of layers
87
+ n_head: int = 12 # number of heads
88
+ n_embd: int = 768 # embedding dimension
89
+
90
+
91
+ class GPT(nn.Module):
92
+
93
+ def __init__(self, config):
94
+ super().__init__()
95
+ self.config = config
96
+
97
+ self.transformer = nn.ModuleDict(dict(
98
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
99
+ wpe = nn.Embedding(config.block_size, config.n_embd),
100
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
101
+ ln_f = nn.LayerNorm(config.n_embd),
102
+ ))
103
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
104
+
105
+ # weight sharing
106
+ self.transformer.wte.weight = self.lm_head.weight
107
+
108
+ # weight initialization
109
+ self.apply(self._init_weights)
110
+
111
+ def _init_weights(self, module):
112
+ if isinstance(module, nn.Linear):
113
+ std = 0.02
114
+ if hasattr(module, 'NANOGPT_SCALE_INIT'): # spelling unified with the attribute set on c_proj
115
+ std *= (2 * self.config.n_layer) ** -0.5
116
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
117
+ if module.bias is not None:
118
+ torch.nn.init.zeros_(module.bias)
119
+ elif isinstance(module, nn.Embedding):
120
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
121
+
122
+
123
+
124
+ def forward(self, idx, targets=None):
125
+ # idx is of shape (B, T)
126
+ B, T = idx.size()
127
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
128
+ # forward the token and position embeddings
129
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
130
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
131
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
132
+ x = tok_emb + pos_emb
133
+ # forward the blocks of the transformer
134
+ for block in self.transformer.h:
135
+ x = block(x)
136
+ # forward the final layernorm and the classifier
137
+ x = self.transformer.ln_f(x)
138
+ logits = self.lm_head(x) # (B, T, vocab_size)
139
+ loss = None
140
+ if targets is not None:
141
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
142
+ return logits, loss
143
+
144
+ @classmethod
145
+ def from_pretrained(cls, model_type):
146
+ """Loads pretrained GPT-2 model weights from huggingface"""
147
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
148
+ from transformers import GPT2LMHeadModel
149
+ print("loading weights from pretrained gpt: %s" % model_type)
150
+
151
+ # n_layer, n_head and n_embd are determined from model_type
152
+ config_args = {
153
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
154
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
155
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
156
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
157
+ }[model_type]
158
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
159
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
160
+ # create a from-scratch initialized minGPT model
161
+ config = GPTConfig(**config_args)
162
+ model = GPT(config)
163
+ sd = model.state_dict()
164
+ sd_keys = sd.keys()
165
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
166
+
167
+ # init a huggingface/transformers model
168
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
169
+ sd_hf = model_hf.state_dict()
170
+
171
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
172
+ sd_keys_hf = sd_hf.keys()
173
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
174
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
175
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
176
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
177
+ # this means that we have to transpose these weights when we import them
178
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
179
+ for k in sd_keys_hf:
180
+ if any(k.endswith(w) for w in transposed):
181
+ # special treatment for the Conv1D weights we need to transpose
182
+ assert sd_hf[k].shape[::-1] == sd[k].shape
183
+ with torch.no_grad():
184
+ sd[k].copy_(sd_hf[k].t())
185
+ else:
186
+ # vanilla copy over the other parameters
187
+ assert sd_hf[k].shape == sd[k].shape
188
+ with torch.no_grad():
189
+ sd[k].copy_(sd_hf[k])
190
+
191
+ return model
192
+
193
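+ # optimizer setup: 2D tensors (matmul weights, embeddings) get weight decay, 1D tensors (biases, layernorms) do not;
+ # the fused AdamW kernel is used when the installed torch build exposes it and the device is CUDA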
+ def configure_optimizers(self, weight_decay, learning_rate, device_type):
194
+ # start with all of the candidate parameters (that require grad)
195
+ param_dict = {pn: p for pn, p in self.named_parameters()}
196
+ param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
197
+ # create optim groups. Any parameter that is 2D will be weight decayed; everything else will not.
198
+ # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
199
+ decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
200
+ nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
201
+ optim_groups = [
202
+ {'params': decay_params, 'weight_decay': weight_decay},
203
+ {'params': nodecay_params, 'weight_decay': 0.0}
204
+ ]
205
+ num_decay_params = sum(p.numel() for p in decay_params)
206
+ num_nodecay_params = sum(p.numel() for p in nodecay_params)
207
+
208
+ print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
209
+ print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
210
+ # Create AdamW optimizer and use the fused version if it is available
211
+ fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
212
+ use_fused = fused_available and device_type == "cuda"
213
+
214
+ print(f"using fused AdamW: {use_fused}")
215
+ optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
216
+ return optimizer
217
+
218
+ # model = GPT.from_pretrained('gpt2')
219
+
220
+ device = 'cpu'
221
+ if torch.cuda.is_available():
222
+ device = 'cuda'
223
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
224
+ device = "mps"
225
+ print(f"using device: {device}")
226
+
227
+ # SEED
228
+ torch.manual_seed(1337)
229
+ if torch.cuda.is_available():
230
+ torch.cuda.manual_seed(1337)
231
+
232
+ # STOP
233
+ num_return_sequences = 5
234
+ max_length = 30
235
+
236
+
237
+
238
+ import tiktoken
239
+ import os
240
+ os.environ['TIKTOKEN_CACHE_DIR'] = '/raid/users/mohammadibrahim-st/TSAI/Assignment21/tmp'
241
+ class DataLoaderLite:
242
+ def __init__(self, B, T):
243
+ self.B = B
244
+ self.T = T
245
+
246
+ # at init load tokens from disk and store them in memory
247
+ with open('/raid/users/mohammadibrahim-st/TSAI/Assignment21/input.txt', 'r') as f:
248
+ text = f.read()
249
+ enc = tiktoken.get_encoding('gpt2')
250
+ tokens = enc.encode(text)
251
+ self.tokens = torch.tensor(tokens)
252
+ print(f'loaded {len(self.tokens)} tokens')
253
+ print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
254
+
255
+ # state
256
+ self.current_position = 0
257
+
258
+ def next_batch(self):
259
+ B, T = self.B, self.T
260
+ buf = self.tokens[self.current_position: self.current_position + B * T + 1]
261
+ x = (buf[:-1]).view(B, T) # inputs
262
+ y = (buf[1:]).view(B, T) # targets
263
+ # advance the position in the tensor
264
+ self.current_position += B*T
265
+ # if loading the next batch would be out of bounds, reset
266
+ if self.current_position + (B * T + 1) > len(self.tokens):
267
+ self.current_position = 0
268
+ return x, y
269
+
270
+ # CHANGES IN CURRENT CODE
271
+ torch.set_float32_matmul_precision('high')
272
+ model = GPT(GPTConfig())
273
+ model.to(device)
274
+ # model = torch.compile(model)
275
+
276
+ # CODE UPDATE HERE
277
+ max_lr = 6e-4
278
+ min_lr = max_lr * 0.1
279
+ warmup_steps = 10
280
+ max_steps = 5000
281
+
282
+ def get_lr(it):
283
+ if it < warmup_steps:
284
+ return max_lr * (it + 1) / warmup_steps
285
+ if it > max_steps:
286
+ return min_lr
287
+ decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
288
+ assert 0 <= decay_ratio <=1
289
+ coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
290
+ return min_lr + coeff * (max_lr - min_lr)
291
+
292
+ train_loader = DataLoaderLite(B = 16, T = 1024)
293
+
294
+ # NEW CODE
295
+ import time
296
+ # optimizer = torch.optim.AdamW(model.parameters(), lr = 3e-4, betas=(0.9, 0.95), eps=1e-8)
297
+ optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
298
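+ # training loop: one batch per step with bf16 autocast, gradient clipping at norm 1.0, and the cosine LR schedule above; throughput is reported as tokens/sec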
+ for step in range(max_steps):
299
+ t0 = time.time()
300
+ x, y = train_loader.next_batch()
301
+ x, y = x.to(device), y.to(device)
302
+ optimizer.zero_grad()
303
+ # NEW CODE ADDED HERE
304
+ with torch.autocast(device_type=device, dtype=torch.bfloat16):
305
+ logits, loss = model(x, y)
306
+ loss.backward()
307
+ norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip_grad_norm_ is the in-place, non-deprecated API; returns the pre-clip global norm
308
+ # NEW CODE
309
+ lr = get_lr(step)
310
+ for param_group in optimizer.param_groups:
311
+ param_group['lr'] = lr
312
+
313
+ optimizer.step()
314
+ torch.cuda.synchronize()
315
+ t1 = time.time()
316
+ dt = (t1 - t0) * 1000
317
+ tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)
318
+ print(f'step {step} | loss: {loss.item()} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f} | norm: {norm:.2f}')
319
+
320
+
321
+ print(loss)
322
+ model_save_path = '/raid/users/mohammadibrahim-st/TSAI/Assignment21/model5k.pt'
323
+ torch.save(model.state_dict(), model_save_path)
324
+ print(f'Trained model saved at: {model_save_path}')
325
+ import sys; sys.exit(0)
326
+
327
+ torch.manual_seed(42)
328
+ torch.cuda.manual_seed(42)
329
+ while x.size(1) < max_length:
330
+ # forward the model to get the logits
331
+ with torch.no_grad():
332
+ logits = model(x)[0] # (B, T, vocab_size)
333
+ # take the logits at the last position
334
+ logits = logits[:, -1, :] # (B, vocab_size)
335
+ # get the probabilities
336
+ probs = F.softmax(logits, dim=-1)
337
+ # do top-k sampling of 50 (huggingface pipeline default)
338
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
339
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
340
+ # select a token from the top-k probabilities
341
+ # note: multinomial does not demand the input to sum to 1
342
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
343
+ # gather the corresponding indices
344
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
345
+ # append to the sequence
346
+ x = torch.cat((x, xcol), dim=1)
347
+
348
+ # print the generated text
349
+ for i in range(num_return_sequences):
350
+ tokens = x[i, :max_length].tolist()
351
+ decoded = enc.decode(tokens)
352
+ print(">", decoded)
app.py ADDED
@@ -0,0 +1,280 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+ import tiktoken
5
+ import os
6
+ import math
7
+ import time
8
+ import gradio as gr
9
+ import inspect
10
+ from dataclasses import dataclass
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.nn import functional as F
14
+ import os
15
+ # os.environ['TIKTOKEN_CACHE_DIR'] = '/raid/users/mohammadibrahim-st/TSAI/Assignment21/tmp'
16
+ class CausalSelfAttention(nn.Module):
17
+
18
+ def __init__(self, config):
19
+ super().__init__()
20
+ assert config.n_embd % config.n_head == 0
21
+ # key, query, value projections for all heads, but in a batch
22
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
23
+ # output projection
24
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
25
+ self.c_proj.NANOGPT_SCALE_INIT = 1 # spelling unified with _init_weights and MLP
26
+ # regularization
27
+ self.n_head = config.n_head
28
+ self.n_embd = config.n_embd
29
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
30
+
31
+ def forward(self, x):
32
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
33
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
34
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
35
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
36
+ qkv = self.c_attn(x)
37
+ q, k, v = qkv.split(self.n_embd, dim=2)
38
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
39
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
40
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
41
+
42
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
43
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
44
+ # att = F.softmax(att, dim=-1)
45
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
46
+
47
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
48
+
49
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
50
+ # output projection
51
+ y = self.c_proj(y)
52
+ return y
53
+
54
+
55
+ class MLP(nn.Module):
56
+
57
+ def __init__(self, config):
58
+ super().__init__()
59
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
60
+ self.gelu = nn.GELU(approximate='tanh')
61
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
62
+ self.c_proj.NANOGPT_SCALE_INIT = 1
63
+
64
+ def forward(self, x):
65
+ x = self.c_fc(x)
66
+ x = self.gelu(x)
67
+ x = self.c_proj(x)
68
+ return x
69
+
70
+ class Block(nn.Module):
71
+
72
+ def __init__(self, config):
73
+ super().__init__()
74
+ self.ln_1 = nn.LayerNorm(config.n_embd)
75
+ self.attn = CausalSelfAttention(config)
76
+ self.ln_2 = nn.LayerNorm(config.n_embd)
77
+ self.mlp = MLP(config)
78
+
79
+ def forward(self, x):
80
+ x = x + self.attn(self.ln_1(x))
81
+ x = x + self.mlp(self.ln_2(x))
82
+ return x
83
+
84
+ @dataclass
85
+ class GPTConfig:
86
+ block_size: int = 1024 # max sequence length
87
+ vocab_size: int = 50304 # GPT-2's vocab is 50,257 (50,000 BPE merges + 256 byte tokens + 1 <|endoftext|>), padded up to 50,304 (a multiple of 128) for better GPU utilization
88
+ n_layer: int = 12 # number of layers
89
+ n_head: int = 12 # number of heads
90
+ n_embd: int = 768 # embedding dimension
91
+
92
+
93
+ class GPT(nn.Module):
94
+
95
+ def __init__(self, config):
96
+ super().__init__()
97
+ self.config = config
98
+
99
+ self.transformer = nn.ModuleDict(dict(
100
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
101
+ wpe = nn.Embedding(config.block_size, config.n_embd),
102
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
103
+ ln_f = nn.LayerNorm(config.n_embd),
104
+ ))
105
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
106
+
107
+ # weight sharing
108
+ self.transformer.wte.weight = self.lm_head.weight
109
+
110
+ # weight initialization
111
+ self.apply(self._init_weights)
112
+
113
+ def _init_weights(self, module):
114
+ if isinstance(module, nn.Linear):
115
+ std = 0.02
116
+ if hasattr(module, 'NANOGPT_SCALE_INIT'): # spelling unified with the attribute set on c_proj
117
+ std *= (2 * self.config.n_layer) ** -0.5
118
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
119
+ if module.bias is not None:
120
+ torch.nn.init.zeros_(module.bias)
121
+ elif isinstance(module, nn.Embedding):
122
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
123
+
124
+
125
+
126
+ def forward(self, idx, targets=None):
127
+ # idx is of shape (B, T)
128
+ B, T = idx.size()
129
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
130
+ # forward the token and position embeddings
131
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
132
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
133
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
134
+ x = tok_emb + pos_emb
135
+ # forward the blocks of the transformer
136
+ for block in self.transformer.h:
137
+ x = block(x)
138
+ # forward the final layernorm and the classifier
139
+ x = self.transformer.ln_f(x)
140
+ logits = self.lm_head(x) # (B, T, vocab_size)
141
+ loss = None
142
+ if targets is not None:
143
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
144
+ return logits, loss
145
+
146
+ @classmethod
147
+ def from_pretrained(cls, model_type):
148
+ """Loads pretrained GPT-2 model weights from huggingface"""
149
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
150
+ from transformers import GPT2LMHeadModel
151
+ print("loading weights from pretrained gpt: %s" % model_type)
152
+
153
+ # n_layer, n_head and n_embd are determined from model_type
154
+ config_args = {
155
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
156
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
157
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
158
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
159
+ }[model_type]
160
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
161
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
162
+ # create a from-scratch initialized minGPT model
163
+ config = GPTConfig(**config_args)
164
+ model = GPT(config)
165
+ sd = model.state_dict()
166
+ sd_keys = sd.keys()
167
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
168
+
169
+ # init a huggingface/transformers model
170
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
171
+ sd_hf = model_hf.state_dict()
172
+
173
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
174
+ sd_keys_hf = sd_hf.keys()
175
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
176
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
177
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
178
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
179
+ # this means that we have to transpose these weights when we import them
180
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
181
+ for k in sd_keys_hf:
182
+ if any(k.endswith(w) for w in transposed):
183
+ # special treatment for the Conv1D weights we need to transpose
184
+ assert sd_hf[k].shape[::-1] == sd[k].shape
185
+ with torch.no_grad():
186
+ sd[k].copy_(sd_hf[k].t())
187
+ else:
188
+ # vanilla copy over the other parameters
189
+ assert sd_hf[k].shape == sd[k].shape
190
+ with torch.no_grad():
191
+ sd[k].copy_(sd_hf[k])
192
+
193
+ return model
194
+
195
+ def configure_optimizers(self, weight_decay, learning_rate, device_type):
196
+ # start with all of the candidate parameters (that require grad)
197
+ param_dict = {pn: p for pn, p in self.named_parameters()}
198
+ param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
199
+ # create optim groups. Any parameter that is 2D will be weight decayed; everything else will not.
200
+ # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
201
+ decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
202
+ nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
203
+ optim_groups = [
204
+ {'params': decay_params, 'weight_decay': weight_decay},
205
+ {'params': nodecay_params, 'weight_decay': 0.0}
206
+ ]
207
+ num_decay_params = sum(p.numel() for p in decay_params)
208
+ num_nodecay_params = sum(p.numel() for p in nodecay_params)
209
+
210
+ print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
211
+ print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
212
+ # Create AdamW optimizer and use the fused version if it is available
213
+ fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
214
+ use_fused = fused_available and device_type == "cuda"
215
+
216
+ print(f"using fused AdamW: {use_fused}")
217
+ optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
218
+ return optimizer
219
+
220
+
221
+ # Set the device
222
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
223
+ num_return_sequences = 5
224
+ max_length = 30
225
+ # Load the trained model
226
+ import os
227
+ current_directory = os.path.dirname(os.path.abspath(__file__))
228
+
229
+ # Set the model path to the same directory as the Python file
230
+ model_save_path = os.path.join(current_directory, 'model5k.pt')
231
+ model = GPT(GPTConfig())
232
+ model.load_state_dict(torch.load(model_save_path, map_location=device)) # map_location lets the checkpoint load on CPU-only hosts too
233
+ model.to(device)
234
+ model.eval()
235
+
236
+ # Tokenizer
237
+ enc = tiktoken.get_encoding('gpt2')
238
+ def generate_text(user_prompt):
239
+ num_return_sequences = 5
240
+ max_length = 30
241
+
242
+ # Tokenize input prompt
243
+ tokens = enc.encode(user_prompt)
244
+ tokens = torch.tensor(tokens, dtype=torch.long)
245
+ tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # Repeat for each sequence
246
+ x = tokens.to(device)
247
+
248
+ # Fix seeds for reproducibility
249
+ torch.manual_seed(42)
250
+ torch.cuda.manual_seed(42)
251
+
252
+ # Generate sequences until max_length
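+ # autoregressive decoding: each pass samples one token per sequence from the top-50 probabilities and appends it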
253
+ while x.size(1) < max_length:
254
+ with torch.no_grad():
255
+ logits = model(x)[0] # Get logits
256
+ logits = logits[:, -1, :] # Take the logits at the last position
257
+ probs = F.softmax(logits, dim=-1) # Get the probabilities
258
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # Top-k sampling
259
+ ix = torch.multinomial(topk_probs, 1) # Select a token
260
+ xcol = torch.gather(topk_indices, -1, ix) # Gather the corresponding indices
261
+ x = torch.cat((x, xcol), dim=1) # Append the selected token to the sequence
262
+
263
+ # Decode and return generated sequences
264
+ generated_texts = []
265
+ for i in range(num_return_sequences):
266
+ tokens = x[i, :max_length].tolist()
267
+ decoded = enc.decode(tokens)
268
+ generated_texts.append(decoded)
269
+
270
+ return "\n\n".join(generated_texts)
271
+
272
+ # Create Gradio interface
273
+ iface = gr.Interface(fn=generate_text,
274
+ inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
275
+ outputs="text",
276
+ title="GPT Text Generator",
277
+ description="Generate text using your trained GPT model. Enter a prompt and see what the model generates.")
278
+
279
+ # Launch the Gradio app
280
+ iface.launch()
infer.py ADDED
@@ -0,0 +1,265 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.nn import functional as F
4
+ import tiktoken
5
+ import os
6
+ import math
7
+ import time
8
+ import inspect
9
+ from dataclasses import dataclass
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.nn import functional as F
13
+ import os
14
+ os.environ['TIKTOKEN_CACHE_DIR'] = '/raid/users/mohammadibrahim-st/TSAI/Assignment21/tmp'
15
+ class CausalSelfAttention(nn.Module):
16
+
17
+ def __init__(self, config):
18
+ super().__init__()
19
+ assert config.n_embd % config.n_head == 0
20
+ # key, query, value projections for all heads, but in a batch
21
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
22
+ # output projection
23
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd)
24
+ self.c_proj.NANOGPT_SCALE_INIT = 1 # spelling unified with _init_weights and MLP
25
+ # regularization
26
+ self.n_head = config.n_head
27
+ self.n_embd = config.n_embd
28
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
29
+
30
+ def forward(self, x):
31
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
32
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
33
+ # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
34
+ # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
35
+ qkv = self.c_attn(x)
36
+ q, k, v = qkv.split(self.n_embd, dim=2)
37
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
38
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
39
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
40
+
41
+ # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
42
+ # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
43
+ # att = F.softmax(att, dim=-1)
44
+ # y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
45
+
46
+ y = F.scaled_dot_product_attention(q, k, v, is_causal = True) # Flash attention
47
+
48
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
49
+ # output projection
50
+ y = self.c_proj(y)
51
+ return y
52
+
53
+
54
+ class MLP(nn.Module):
55
+
56
+ def __init__(self, config):
57
+ super().__init__()
58
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
59
+ self.gelu = nn.GELU(approximate='tanh')
60
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
61
+ self.c_proj.NANOGPT_SCALE_INIT = 1
62
+
63
+ def forward(self, x):
64
+ x = self.c_fc(x)
65
+ x = self.gelu(x)
66
+ x = self.c_proj(x)
67
+ return x
68
+
69
+ class Block(nn.Module):
70
+
71
+ def __init__(self, config):
72
+ super().__init__()
73
+ self.ln_1 = nn.LayerNorm(config.n_embd)
74
+ self.attn = CausalSelfAttention(config)
75
+ self.ln_2 = nn.LayerNorm(config.n_embd)
76
+ self.mlp = MLP(config)
77
+
78
+ def forward(self, x):
79
+ x = x + self.attn(self.ln_1(x))
80
+ x = x + self.mlp(self.ln_2(x))
81
+ return x
82
+
83
+ @dataclass
84
+ class GPTConfig:
85
+ block_size: int = 1024 # max sequence length
86
+ vocab_size: int = 50304 # GPT-2's vocab is 50,257 (50,000 BPE merges + 256 byte tokens + 1 <|endoftext|>), padded up to 50,304 (a multiple of 128) for better GPU utilization
87
+ n_layer: int = 12 # number of layers
88
+ n_head: int = 12 # number of heads
89
+ n_embd: int = 768 # embedding dimension
90
+
91
+
92
+ class GPT(nn.Module):
93
+
94
+ def __init__(self, config):
95
+ super().__init__()
96
+ self.config = config
97
+
98
+ self.transformer = nn.ModuleDict(dict(
99
+ wte = nn.Embedding(config.vocab_size, config.n_embd),
100
+ wpe = nn.Embedding(config.block_size, config.n_embd),
101
+ h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
102
+ ln_f = nn.LayerNorm(config.n_embd),
103
+ ))
104
+ self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
105
+
106
+ # weight sharing
107
+ self.transformer.wte.weight = self.lm_head.weight
108
+
109
+ # weight initialization
110
+ self.apply(self._init_weights)
111
+
112
+ def _init_weights(self, module):
113
+ if isinstance(module, nn.Linear):
114
+ std = 0.02
115
+ if hasattr(module, 'NANOGPT_SCALE_INIT'): # spelling unified with the attribute set on c_proj
116
+ std *= (2 * self.config.n_layer) ** -0.5
117
+ torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
118
+ if module.bias is not None:
119
+ torch.nn.init.zeros_(module.bias)
120
+ elif isinstance(module, nn.Embedding):
121
+ torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)
122
+
123
+
124
+
125
+ def forward(self, idx, targets=None):
126
+ # idx is of shape (B, T)
127
+ B, T = idx.size()
128
+ assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
129
+ # forward the token and position embeddings
130
+ pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
131
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
132
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
133
+ x = tok_emb + pos_emb
134
+ # forward the blocks of the transformer
135
+ for block in self.transformer.h:
136
+ x = block(x)
137
+ # forward the final layernorm and the classifier
138
+ x = self.transformer.ln_f(x)
139
+ logits = self.lm_head(x) # (B, T, vocab_size)
140
+ loss = None
141
+ if targets is not None:
142
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
143
+ return logits, loss
144
+
145
+ @classmethod
146
+ def from_pretrained(cls, model_type):
147
+ """Loads pretrained GPT-2 model weights from huggingface"""
148
+ assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
149
+ from transformers import GPT2LMHeadModel
150
+ print("loading weights from pretrained gpt: %s" % model_type)
151
+
152
+ # n_layer, n_head and n_embd are determined from model_type
153
+ config_args = {
154
+ 'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
155
+ 'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
156
+ 'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
157
+ 'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
158
+ }[model_type]
159
+ config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
160
+ config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
161
+ # create a from-scratch initialized minGPT model
162
+ config = GPTConfig(**config_args)
163
+ model = GPT(config)
164
+ sd = model.state_dict()
165
+ sd_keys = sd.keys()
166
+ sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
167
+
168
+ # init a huggingface/transformers model
169
+ model_hf = GPT2LMHeadModel.from_pretrained(model_type)
170
+ sd_hf = model_hf.state_dict()
171
+
172
+ # copy while ensuring all of the parameters are aligned and match in names and shapes
173
+ sd_keys_hf = sd_hf.keys()
174
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
175
+ sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
176
+ transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
177
+ # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
178
+ # this means that we have to transpose these weights when we import them
179
+ assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
180
+ for k in sd_keys_hf:
181
+ if any(k.endswith(w) for w in transposed):
182
+ # special treatment for the Conv1D weights we need to transpose
183
+ assert sd_hf[k].shape[::-1] == sd[k].shape
184
+ with torch.no_grad():
185
+ sd[k].copy_(sd_hf[k].t())
186
+ else:
187
+ # vanilla copy over the other parameters
188
+ assert sd_hf[k].shape == sd[k].shape
189
+ with torch.no_grad():
190
+ sd[k].copy_(sd_hf[k])
191
+
192
+ return model
193
+
194
+ def configure_optimizers(self, weight_decay, learning_rate, device_type):
195
+ # start with all of the candidate parameters (that require grad)
196
+ param_dict = {pn: p for pn, p in self.named_parameters()}
197
+ param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
198
+ # create optim groups. Any parameter that is 2D will be weight decayed; everything else will not.
199
+ # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
200
+ decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
201
+ nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
202
+ optim_groups = [
203
+ {'params': decay_params, 'weight_decay': weight_decay},
204
+ {'params': nodecay_params, 'weight_decay': 0.0}
205
+ ]
206
+ num_decay_params = sum(p.numel() for p in decay_params)
207
+ num_nodecay_params = sum(p.numel() for p in nodecay_params)
208
+
209
+ print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
210
+ print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
211
+ # Create AdamW optimizer and use the fused version if it is available
212
+ fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
213
+ use_fused = fused_available and device_type == "cuda"
214
+
215
+ print(f"using fused AdamW: {use_fused}")
216
+ optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
217
+ return optimizer
218
+
219
+
220
+ # Set the device
221
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
222
+ num_return_sequences = 5
223
+ max_length = 30
224
+ # Load the trained model
225
+ import os
226
+ # Set the model save path to the current directory
227
+ model_save_path = os.path.join(os.getcwd(), 'model5k.pt')
228
+ model = GPT(GPTConfig())
229
+ model.load_state_dict(torch.load(model_save_path, map_location=device)) # map_location lets the checkpoint load on CPU-only hosts too
230
+ model.to(device)
231
+ model.eval()
232
+
233
+ # Tokenizer
234
+ enc = tiktoken.get_encoding('gpt2')
235
+ tokens = enc.encode("Hello, I'm a language model,")
236
+ tokens = torch.tensor(tokens, dtype= torch.long) # (8,) #check tiktoken app
237
+ tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
238
+ x = tokens.to(device) # use the selected device rather than hard-coding 'cuda'
239
+
240
+ torch.manual_seed(42)
241
+ torch.cuda.manual_seed(42)
242
+ while x.size(1) < max_length:
243
+ # forward the model to get the logits
244
+ with torch.no_grad():
245
+ logits = model(x)[0] # (B, T, vocab_size)
246
+ # take the logits at the last position
247
+ logits = logits[:, -1, :] # (B, vocab_size)
248
+ # get the probabilities
249
+ probs = F.softmax(logits, dim=-1)
250
+ # do top-k sampling of 50 (huggingface pipeline default)
251
+ # topk_probs here becomes (5, 50), topk_indices is (5, 50)
252
+ topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
253
+ # select a token from the top-k probabilities
254
+ # note: multinomial does not demand the input to sum to 1
255
+ ix = torch.multinomial(topk_probs, 1) # (B, 1)
256
+ # gather the corresponding indices
257
+ xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
258
+ # append to the sequence
259
+ x = torch.cat((x, xcol), dim=1)
260
+
261
+ # print the generated text
262
+ for i in range(num_return_sequences):
263
+ tokens = x[i, :max_length].tolist()
264
+ decoded = enc.decode(tokens)
265
+ print(">", decoded)
input.txt ADDED
The diff for this file is too large to render. See raw diff
 
model5k.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2f929c6742de81974c6bfd22a15941e247c04803f09be17fcab94806620779d
3
+ size 548292146
requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ torch
2
+ tiktoken
tmp/6c7ea1a7e38e3a7f062df639a5b80947f075ffe6 ADDED
The diff for this file is too large to render. See raw diff
 
tmp/6d1cbeee0f20b3d9449abfede4726ed8212e3aee ADDED
The diff for this file is too large to render. See raw diff