-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: config.py
More file actions
106 lines (84 loc) · 5.04 KB
/
config.py
File metadata and controls
106 lines (84 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""
config.py - Central Configuration for MiniGPT
==============================================
All hyperparameters live here in a single dataclass.
Every other module imports from this file, ensuring consistency.
Architecture target: ~95,568 parameters (approximately 100K).
"""
from dataclasses import dataclass
import os
@dataclass
class TransformerConfig:
    """Central configuration for MiniGPT.

    Single source of truth for every hyperparameter; every other module
    imports from this class, ensuring consistency.
    Architecture target: ~95,568 parameters (approximately 100K).

    Raises:
        ValueError: if ``d_model`` is not evenly divisible by ``n_heads``
            (checked in ``__post_init__`` so a bad config fails fast
            instead of silently truncating the per-head dimension).
    """

    # ──────────────────────────────────────────────────────────────
    # MODEL ARCHITECTURE
    # ──────────────────────────────────────────────────────────────

    # Vocabulary size: top-2000 most frequent words from the corpus.
    # Covers ~95% of all token occurrences in Tiny Shakespeare.
    # Words outside this set become <unk>.
    # This is the BIGGEST parameter cost: vocab_size * d_model = 64,000 params.
    vocab_size: int = 2000

    # Embedding dimension: each word is represented as a 32-dim vector.
    # Small, but enough for our tiny model to learn basic patterns.
    # In GPT-2, this is 768; in GPT-3, 12288. We use 32.
    d_model: int = 32

    # Number of attention heads: the d_model dimension is split into
    # n_heads parallel attention mechanisms, each of size d_k = d_model / n_heads.
    # Here: d_k = 32 / 4 = 8 dimensions per head.
    n_heads: int = 4

    # Number of transformer layers (blocks) stacked sequentially.
    # Each block = multi-head attention + feed-forward network.
    # GPT-2 uses 12-48 layers. We use 2 to stay within ~100K params.
    n_layers: int = 2

    # Feed-forward network inner dimension.
    # The FFN expands from d_model -> d_ff -> d_model.
    # Standard ratio is 4x (d_ff = 4 * d_model = 128), but for our
    # parameter budget we use exactly 128 = 4 * 32.
    d_ff: int = 128

    # Maximum sequence length (context window).
    # The model can "see" up to 128 tokens of history when predicting
    # the next word. GPT-2 uses 1024, GPT-3 uses 2048.
    max_seq_len: int = 128

    # Dropout rate: randomly zeroes 10% of activations during training.
    # This prevents overfitting by forcing the network to not rely
    # on any single feature too heavily.
    dropout: float = 0.1

    # ──────────────────────────────────────────────────────────────
    # TRAINING
    # ──────────────────────────────────────────────────────────────
    batch_size: int = 64          # Samples per gradient update
    learning_rate: float = 3e-4   # Peak learning rate (after warmup)
    min_lr: float = 3e-5          # Minimum LR at end of cosine decay
    max_epochs: int = 20          # Maximum training epochs
    warmup_steps: int = 200       # Linear warmup steps (stabilizes early training)
    max_steps: int = 5000         # Total training steps
    grad_clip: float = 1.0        # Max gradient norm (prevents exploding gradients)
    weight_decay: float = 0.01    # L2 regularization on weight matrices only
    eval_interval: int = 500      # Evaluate on validation set every N steps
    log_interval: int = 100       # Print training stats every N steps

    # ──────────────────────────────────────────────────────────────
    # GENERATION
    # ──────────────────────────────────────────────────────────────
    temperature: float = 0.8      # <1 = more confident, >1 = more random
    top_k: int = 40               # Only sample from top-k most likely tokens
    max_gen_len: int = 100        # Maximum tokens to generate

    # ──────────────────────────────────────────────────────────────
    # PATHS
    # ──────────────────────────────────────────────────────────────
    data_dir: str = "data"
    data_file: str = "input.txt"
    vocab_file: str = "vocab.json"
    checkpoint_dir: str = "checkpoints"
    checkpoint_file: str = "model.pt"

    def __post_init__(self) -> None:
        """Validate cross-field invariants immediately after construction.

        Without this check, d_k would silently floor-divide (e.g.
        d_model=30, n_heads=4 -> d_k=7), corrupting attention shapes
        far from the actual mistake.
        """
        if self.n_heads <= 0 or self.d_model % self.n_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by "
                f"n_heads ({self.n_heads})"
            )

    @property
    def data_path(self) -> str:
        """Full path to the raw training corpus."""
        return os.path.join(self.data_dir, self.data_file)

    @property
    def vocab_path(self) -> str:
        """Full path to the saved vocabulary JSON."""
        return os.path.join(self.data_dir, self.vocab_file)

    @property
    def checkpoint_path(self) -> str:
        """Full path to the model checkpoint file."""
        return os.path.join(self.checkpoint_dir, self.checkpoint_file)

    @property
    def d_k(self) -> int:
        """Dimension per attention head (d_model split across n_heads)."""
        return self.d_model // self.n_heads