```python
import re
from collections import defaultdict

def train_bpe(text, num_merges):
    # Split into words, then into characters, with an end-of-word marker
    words = [list(word) + ['</w>'] for word in text.split()]
    # ... (full BPE algorithm here)
    return merges, vocab
```
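The elided body of `train_bpe` repeatedly counts adjacent symbol pairs and fuses the most frequent pair. A minimal sketch of those steps, assuming a greedy merge applied word by word (`count_pairs` and `apply_merge` are illustrative helper names, not from the original):

```python
from collections import defaultdict

def count_pairs(words):
    # Frequency of each adjacent symbol pair across the corpus
    counts = defaultdict(int)
    for word in words:
        for pair in zip(word, word[1:]):
            counts[pair] += 1
    return counts

def apply_merge(words, pair):
    # Rewrite every word, fusing each occurrence of `pair` into one symbol
    merged = pair[0] + pair[1]
    new_words = []
    for word in words:
        out, i = [], 0
        while i < len(word):
            if i + 1 < len(word) and (word[i], word[i + 1]) == pair:
                out.append(merged)
                i += 2
            else:
                out.append(word[i])
                i += 1
        new_words.append(out)
    return new_words

# Inside train_bpe, the elided section then reduces to:
#     merges = []
#     for _ in range(num_merges):
#         counts = count_pairs(words)
#         if not counts:
#             break
#         best = max(counts, key=counts.get)
#         merges.append(best)
#         words = apply_merge(words, best)
#     vocab = {symbol for word in words for symbol in word}
```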
| Symptom | Likely Cause | Solution |
|---------|--------------|----------|
| Loss not decreasing | Learning rate too high or too low | Sweep learning rates (3e-4 is a common AdamW starting point) |
| Loss is NaN | Exploding gradients | Clip gradients or lower the learning rate |
| Model repeats gibberish | Hidden dimensions too small | Increase embedding size (e.g., 128 → 384) |
| Training takes weeks | No data parallelism | Use DistributedDataParallel |
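For the NaN-loss row, the standard PyTorch remedy is to clip gradients just before the optimizer step. A minimal sketch, assuming a typical training loop (`model`, `dataloader`, and `compute_loss` are hypothetical placeholders):

```python
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)  # common starting point

for batch in dataloader:
    optimizer.zero_grad()
    loss = compute_loss(model, batch)  # hypothetical helper
    loss.backward()
    # Rescale gradients so their global norm is at most 1.0
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
```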
```python
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim),
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Attention sub-layer with residual connection
        attn_out = self.attention(x, x, x, mask)
        x = self.ln1(x + self.dropout(attn_out))
        # Feed-forward sub-layer with residual connection
        ff_out = self.feed_forward(x)
        x = self.ln2(x + self.dropout(ff_out))
        return x
```
“You don’t need billions of parameters to learn the principles. A 10-million-parameter model on a Shakespeare corpus teaches the same lessons as GPT-4.”

## Part 2: Step-by-Step Implementation (Code-First)

This is the heart of your PDF. Every serious “build from scratch” guide must include runnable Python code. We’ll use PyTorch, but you could adapt it to JAX or plain NumPy for educational purposes.

### Step 1: Tokenization – Byte Pair Encoding (BPE)

Most modern LLMs use Byte Pair Encoding. Implement a simple version, like the `train_bpe` function shown at the top of this section.

### Step 2: Self-Attention

The core operation of every transformer layer is scaled dot-product attention:

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(query, key, value, mask=None):
    # Scale by sqrt(d_k) to keep the logits in a stable range
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / (d_k ** 0.5)
    if mask is not None:
        # Masked positions get a large negative logit, i.e. ~zero attention
        scores = scores.masked_fill(mask == 0, -1e9)
    attention_weights = F.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, value)
```
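The `MultiHeadAttention` module referenced by `TransformerBlock` above isn’t shown in this excerpt. A minimal sketch built on `scaled_dot_product_attention`, assuming `embed_dim` divides evenly by `num_heads` (the projection-layer names are illustrative):

```python
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "embed_dim must divide evenly across heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        batch, seq_len, embed_dim = query.shape

        def split_heads(x, proj):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return proj(x).view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)

        q = split_heads(query, self.q_proj)
        k = split_heads(key, self.k_proj)
        v = split_heads(value, self.v_proj)
        out = scaled_dot_product_attention(q, k, v, mask)
        # Recombine heads: (batch, heads, seq, head_dim) -> (batch, seq, embed)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, embed_dim)
        return self.out_proj(out)
```

With this in place, a quick shape check (hypothetical dimensions) confirms the transformer block preserves its input shape:

```python
block = TransformerBlock(embed_dim=384, num_heads=6, ff_dim=1536)
x = torch.randn(2, 16, 384)  # (batch, sequence length, embed_dim)
assert block(x).shape == x.shape
```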