// lum_gpt.cpp
#define _USE_MATH_DEFINES
#include <vector>
#include <random>
#include <cmath>
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <memory>
#include <thread>
#include <mutex>
#include <chrono>
#include <iomanip>
#include <unordered_map>
#include <unordered_set>
#include <cstring>
#include <functional>
#include <limits>
// --- Utility Functions and Constants ---
constexpr double EPSILON = 1e-5; // LayerNorm epsilon (GPT-2 paper)
constexpr double INF = 1e30;
constexpr double NEG_INF = -1e30;
// --- Random Number Generator ---
thread_local std::mt19937 rng(1337); // Fixed seed for reproducibility, thread-local
// --- Math Helpers ---
inline double uniform_rand(double low, double high) {
std::uniform_real_distribution<double> dist(low, high);
return dist(rng);
}
inline double normal_rand(double mean, double stddev) {
std::normal_distribution<double> dist(mean, stddev);
return dist(rng);
}
// --- Optimized Tensor Classes ---
// Flattened vector-based tensor for better cache performance and memory management
class Tensor {
public:
std::vector<double> data;
std::vector<size_t> shape;
size_t size() const { return data.size(); }
size_t ndim() const { return shape.size(); }
size_t dim(size_t i) const {
if (i >= shape.size()) {
return 1; // Out-of-range dimension index treated as size 1 (broadcast-friendly default)
}
return shape[i];
}
// More robust default constructor
Tensor() : data(), shape() {} // Explicitly initialize
// Constructor with shape
explicit Tensor(const std::vector<size_t>& s) : shape(s) {
size_t total_size = 1;
for (size_t d : shape) total_size *= d;
data.resize(total_size, 0.0);
}
// Constructor with shape and initial value
Tensor(const std::vector<size_t>& s, double val) : shape(s) {
size_t total_size = 1;
for (size_t d : shape) total_size *= d;
data.resize(total_size, val);
}
double& operator[](size_t idx) { return data[idx]; }
const double& operator[](size_t idx) const { return data[idx]; }
// Calculate flat index from multi-dimensional indices
size_t index(const std::vector<size_t>& indices) const {
assert(indices.size() == shape.size());
size_t idx = 0;
size_t multiplier = 1;
for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
idx += indices[i] * multiplier;
multiplier *= shape[i];
}
return idx;
}
// Access element with bounds checking (simplified)
double& at(const std::vector<size_t>& indices) {
return data[index(indices)];
}
const double& at(const std::vector<size_t>& indices) const {
return data[index(indices)];
}
void fill(double val) {
std::fill(data.begin(), data.end(), val);
}
void zero() {
if (!data.empty()) {
std::memset(data.data(), 0, data.size() * sizeof(double));
}
}
// Helper to check for NaN
bool has_nan() const {
for (const auto& val : data) {
if (std::isnan(val)) return true;
}
return false;
}
// Assignment operator for correct copying
Tensor& operator=(const Tensor& other) {
if (this != &other) {
shape = other.shape;
data = other.data;
}
return *this;
}
};
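// Indexing example (illustrative note, not part of the original file): storage is
// row-major, so for a Tensor t({2, 3}) the element (i, j) lives at flat offset
// i * 3 + j; e.g. t.index({1, 2}) == 5 and t.at({1, 2}) refers to t.data[5].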
// --- Optimized Math Operations on Tensors ---
void fill_normal(Tensor& t, double mean, double stddev) {
for (auto& val : t.data) {
val = normal_rand(mean, stddev);
}
}
// Y = X + b (Broadcast b over the last dimension of X)
void add_broadcast_last_dim(const Tensor& X, const Tensor& b, Tensor& Y) {
// Assuming X is [..., N], b is [N], Y is [..., N]
size_t N = X.dim(X.ndim() - 1);
size_t num_elements = X.size();
assert(b.size() == N);
assert(Y.size() == num_elements);
for (size_t i = 0; i < num_elements; ++i) {
Y.data[i] = X.data[i] + b.data[i % N];
}
}
// C = A * B (Matrix multiplication: [M, K] x [K, N] -> [M, N])
// Corrected version: Resizes C to the correct dimensions
void matmul(const Tensor& A, const Tensor& B, Tensor& C) {
// Ensure inputs are 2D
assert(A.ndim() == 2 && B.ndim() == 2);
size_t M = A.dim(0);
size_t K_A = A.dim(1);
size_t K_B = B.dim(0);
size_t N = B.dim(1);
// Check inner dimensions match
assert(K_A == K_B);
size_t K = K_A; // == K_B
// Resize C to the correct dimensions if necessary
// This handles default-constructed tensors or tensors with wrong sizes
if (C.shape.size() != 2 || C.dim(0) != M || C.dim(1) != N) {
C = Tensor({M, N}); // Assign a new tensor with correct shape
}
// No need for the original assert(C.ndim() == 2) as we've ensured it
// assert(C.dim(0) == M && C.dim(1) == N); // Also ensured by resize
// Perform matrix multiplication
// Zero C first to ensure correct accumulation
C.zero();
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < N; ++j) {
double sum = 0.0;
for (size_t k = 0; k < K; ++k) {
sum += A.data[i * K + k] * B.data[k * N + j];
}
C.data[i * N + j] = sum;
}
}
}
// C = A + B (Element-wise)
void add(const Tensor& A, const Tensor& B, Tensor& C) {
assert(A.size() == B.size() && B.size() == C.size());
for (size_t i = 0; i < A.size(); ++i) {
C.data[i] = A.data[i] + B.data[i];
}
}
void add_inplace(Tensor& A, const Tensor& B) {
assert(A.size() == B.size());
for (size_t i = 0; i < A.size(); ++i) {
A.data[i] += B.data[i];
}
}
// B = A^T (Transpose: [M, N] -> [N, M])
// Corrected version: Resizes AT to the correct dimensions
void transpose(const Tensor& A, Tensor& AT) {
// Ensure input is 2D
assert(A.ndim() == 2);
size_t M = A.dim(0);
size_t N = A.dim(1);
// Resize AT to the correct dimensions if necessary
// This handles default-constructed tensors or tensors with wrong sizes
if (AT.shape.size() != 2 || AT.dim(0) != N || AT.dim(1) != M) {
AT = Tensor({N, M}); // Assign a new tensor with correct shape
}
// No need for the original assert(A.ndim() == 2 && AT.ndim() == 2)
// as we've ensured AT is 2D with correct dims
// assert(AT.dim(0) == N && AT.dim(1) == M); // Also ensured by resize
for (size_t i = 0; i < M; ++i) {
for (size_t j = 0; j < N; ++j) {
AT.data[j * M + i] = A.data[i * N + j];
}
}
}
// Element-wise GELU approximation (GPT-2 standard)
inline double gelu_scalar(double x) {
const double sqrt_2_over_pi = std::sqrt(2.0 / M_PI);
return 0.5 * x * (1.0 + std::tanh(sqrt_2_over_pi * (x + 0.044715 * x * x * x)));
}
void gelu(const Tensor& X, Tensor& Y) {
assert(X.size() == Y.size());
for (size_t i = 0; i < X.size(); ++i) {
Y.data[i] = gelu_scalar(X.data[i]);
}
}
void gelu_inplace(Tensor& X) {
for (size_t i = 0; i < X.size(); ++i) {
X.data[i] = gelu_scalar(X.data[i]);
}
}
// Element-wise GELU gradient (approximation)
inline double gelu_grad_scalar(double x) {
const double sqrt_2_over_pi = std::sqrt(2.0 / M_PI);
double x3 = x * x * x;
double inner = sqrt_2_over_pi * (x + 0.044715 * x3);
double tanh_inner = std::tanh(inner);
double sech2_inner = 1.0 - tanh_inner * tanh_inner; // sech^2(x) = 1 - tanh^2(x)
double d_inner_dx = sqrt_2_over_pi * (1.0 + 3.0 * 0.044715 * x * x);
return 0.5 * (1.0 + tanh_inner + x * d_inner_dx * sech2_inner);
}
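// Derivation note (added for reference): with u(x) = sqrt(2/pi) * (x + 0.044715 * x^3),
// d/dx [ 0.5 * x * (1 + tanh(u)) ] = 0.5 * (1 + tanh(u)) + 0.5 * x * u'(x) * sech^2(u),
// which is exactly the expression returned above (using sech^2(u) = 1 - tanh^2(u)).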
// grad_X = grad_Y * gelu_grad(X)
void gelu_grad(const Tensor& X, const Tensor& grad_Y, Tensor& grad_X) {
assert(X.size() == grad_Y.size() && grad_Y.size() == grad_X.size());
for (size_t i = 0; i < X.size(); ++i) {
grad_X.data[i] = grad_Y.data[i] * gelu_grad_scalar(X.data[i]);
}
}
// Softmax with numerical stability (max trick) on the last dimension
void softmax(const Tensor& X, Tensor& Y) {
// Assuming X and Y are [..., Vocab]
size_t total_elements = X.size();
size_t vocab_size = X.dim(X.ndim() - 1);
size_t num_rows = total_elements / vocab_size; // Number of rows to softmax over
for (size_t row = 0; row < num_rows; ++row) {
size_t row_start = row * vocab_size;
// Find max for numerical stability
double max_val = X.data[row_start];
for (size_t j = 1; j < vocab_size; ++j) {
if (X.data[row_start + j] > max_val) {
max_val = X.data[row_start + j];
}
}
// Compute exp and sum
double sum = 0.0;
for (size_t j = 0; j < vocab_size; ++j) {
Y.data[row_start + j] = std::exp(X.data[row_start + j] - max_val);
sum += Y.data[row_start + j];
}
// Normalize
for (size_t j = 0; j < vocab_size; ++j) {
Y.data[row_start + j] /= sum;
}
}
}
// Combined Softmax + Cross-Entropy Gradient: grad_logits = probs - true_dist
// This is the mathematically correct and numerically stable gradient.
void softmax_crossentropy_grad(const Tensor& probs, const Tensor& y_true_one_hot, Tensor& grad_logits) {
assert(probs.size() == y_true_one_hot.size() && y_true_one_hot.size() == grad_logits.size());
for(size_t i = 0; i < probs.size(); ++i) {
grad_logits.data[i] = probs.data[i] - y_true_one_hot.data[i];
}
}
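// Derivation note (added for reference): with p = softmax(z) and a one-hot target y,
// the cross-entropy loss L = -sum_i y_i * log(p_i) has dL/dz_i = p_i - y_i, which is
// what this function returns; any averaging over rows or batch is left to the caller.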
// --- Core Components ---
// Linear Layer: Y = X @ W + b
class Linear {
public:
Tensor W, b; // Weights [out_features, in_features], Bias [out_features]
Tensor grad_W, grad_b; // Gradients
Tensor input_buffer; // Stored for backward pass [batch_size, in_features]
Linear(size_t in_features, size_t out_features)
: W({out_features, in_features}),
b({out_features}),
grad_W({out_features, in_features}, 0.0),
grad_b({out_features}, 0.0) {
// Xavier/Glorot init for weights
double stddev = std::sqrt(2.0 / (in_features + out_features));
fill_normal(W, 0.0, stddev);
fill_normal(b, 0.0, stddev); // Small random bias init; note GPT-2 initializes biases to zero
}
void forward(const Tensor& X, Tensor& Y) { // Y: [batch_size, out_features]
input_buffer = X; // Store for backward pass
Tensor WT;
transpose(W, WT); // [in_features, out_features]
matmul(X, WT, Y); // [batch_size, in_features] x [in_features, out_features] -> [batch_size, out_features]
add_broadcast_last_dim(Y, b, Y); // Add bias
}
// Full backward pass implementing the chain rule precisely as per document derivations
// dL/dX = dL/dY * W
// dL/dW = sum_over_batch( X^T * dL/dY )
// dL/db = sum_over_batch( dL/dY )
void backward(const Tensor& grad_Y, Tensor& grad_X) { // grad_Y: [batch_size, out_features]
size_t batch_size = grad_Y.dim(0);
size_t in_features = W.dim(1);
size_t out_features = W.dim(0);
// --- CRITICAL FIX: Ensure grad_X is correctly sized ---
if (grad_X.shape.size() != 2 || grad_X.dim(0) != batch_size || grad_X.dim(1) != in_features) {
grad_X = Tensor({batch_size, in_features}); // Resize/assign correctly
}
// --- END CRITICAL FIX ---
assert(grad_X.dim(0) == batch_size && grad_X.dim(1) == in_features);
// 1. dL/dX = dL/dY * W
// grad_Y: [batch_size, out_features], W: [out_features, in_features] -> grad_X: [batch_size, in_features]
matmul(grad_Y, W, grad_X); // Matrix multiplication handles the sum over out_features
// 2. dL/dW = sum_over_batch( X^T * dL/dY )
// input_buffer (X): [batch_size, in_features], grad_Y: [batch_size, out_features]
// X^T: [in_features, batch_size] * grad_Y: [batch_size, out_features] -> [in_features, out_features]
Tensor XT;
transpose(input_buffer, XT); // [in_features, batch_size]
Tensor grad_W_local({in_features, out_features});
matmul(XT, grad_Y, grad_W_local); // Result: [in_features, out_features]
Tensor grad_W_local_T; // Need to transpose to match W's shape [out_features, in_features]
transpose(grad_W_local, grad_W_local_T);
add_inplace(grad_W, grad_W_local_T); // Accumulate into grad_W
// 3. dL/db = sum_over_batch( dL/dY )
// Sum gradients over the batch dimension
for (size_t i = 0; i < batch_size; ++i) {
for (size_t j = 0; j < out_features; ++j) {
grad_b.data[j] += grad_Y.data[i * out_features + j];
}
}
}
void zero_grad() {
grad_W.zero();
grad_b.zero();
}
};
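// Usage sketch (illustrative, hypothetical sizes): a Linear layer(4, 8) owns W of shape
// [8, 4]; forward() maps X [batch_size, 4] -> Y [batch_size, 8] and caches X, so a later
// backward(grad_Y, grad_X) accumulates grad_W/grad_b and fills grad_X with shape [batch_size, 4].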
// Layer Normalization (Pre-LN as in GPT)
class LayerNorm {
public:
size_t normalized_shape; // Size of the last dimension to normalize over
Tensor gamma, beta; // Learnable parameters [normalized_shape]
Tensor grad_gamma, grad_beta; // Gradients
// Buffers for backward pass to store intermediate computations
Tensor x_mu_buffer; // [batch_size, seq_len, normalized_shape] - (x - mean)
Tensor x_sigma_inv_buffer; // [batch_size, seq_len] - 1 / sqrt(variance + eps)
Tensor x_norm_buffer; // [batch_size, seq_len, normalized_shape] - normalized input (x_hat)
LayerNorm(size_t features)
: normalized_shape(features),
gamma({features}, 1.0),
beta({features}, 0.0),
grad_gamma({features}, 0.0),
grad_beta({features}, 0.0) {
        // gamma is drawn from N(1.0, 0.02) here; standard GPT-2 practice initializes
        // gamma to exactly 1.0 (already done by the member initializer above).
        fill_normal(gamma, 1.0, 0.02);
        // beta is initialized to 0, matching GPT-2 practice.
        beta.fill(0.0);
}
void forward(const Tensor& X, Tensor& Y) { // X, Y: [..., normalized_shape]
Y = X; // Y inherits shape from X
size_t total_elements = X.size();
size_t N = normalized_shape; // features
assert(total_elements % N == 0);
size_t num_groups = total_elements / N; // Number of groups (e.g., batch*seq_len) to normalize
        // Size the buffers: copy-assign from X to get the right shape (values are overwritten below)
        x_mu_buffer = X;
        x_norm_buffer = X;
        x_sigma_inv_buffer = Tensor({num_groups}); // One 1/sigma value per normalized group
for (size_t g = 0; g < num_groups; ++g) {
size_t group_start = g * N;
// 1. Calculate mean: μ = (1/N) * Σ(x_i)
double sum = 0.0;
for (size_t i = 0; i < N; ++i) {
sum += X.data[group_start + i];
}
double mean = sum / N;
// 2. Calculate variance and x_mu: σ^2 = (1/N) * Σ(x_i - μ)^2
double sum_sq_diff = 0.0;
for (size_t i = 0; i < N; ++i) {
double diff = X.data[group_start + i] - mean;
x_mu_buffer.data[group_start + i] = diff; // Store (x_i - μ)
sum_sq_diff += diff * diff;
}
double variance = sum_sq_diff / N;
// 3. Calculate standard deviation inverse: 1 / sqrt(σ^2 + ε)
double sigma = std::sqrt(variance + EPSILON);
x_sigma_inv_buffer.data[g] = 1.0 / sigma; // Store 1/σ
// 4. Normalize, scale, and shift: y_i = γ * (x_i - μ) / σ + β
for (size_t i = 0; i < N; ++i) {
x_norm_buffer.data[group_start + i] = x_mu_buffer.data[group_start + i] * x_sigma_inv_buffer.data[g]; // x_hat
Y.data[group_start + i] = gamma.data[i] * x_norm_buffer.data[group_start + i] + beta.data[i];
}
}
}
// Full backward pass implementing the precise mathematical derivation for LayerNorm gradients
// dL/dγ = Σ_features( dL/dy_i * x_hat_i )
// dL/dβ = Σ_features( dL/dy_i )
// dL/dx_i = (σ_inv / N) * ( N * dL/dy_i * γ_i - Σ(dL/dy_j * γ_j) - x_hat_i * Σ(dL/dy_j * γ_j * x_hat_j) )
void backward(const Tensor& grad_Y, Tensor& grad_X) { // grad_Y, grad_X: [..., normalized_shape]
size_t total_elements = grad_Y.size();
size_t N = normalized_shape; // features
assert(total_elements % N == 0);
size_t num_groups = total_elements / N;
// --- CRITICAL FIX: Ensure grad_X is correctly sized ---
if (grad_X.shape.size() != grad_Y.shape.size()) {
grad_X = Tensor(grad_Y.shape); // Match shape if ndim is different
} else {
bool shape_mismatch = false;
for (size_t i = 0; i < grad_Y.ndim(); ++i) {
if (grad_X.dim(i) != grad_Y.dim(i)) {
shape_mismatch = true;
break;
}
}
if (shape_mismatch) {
grad_X = Tensor(grad_Y.shape); // Resize if shape mismatches
}
}
// --- END CRITICAL FIX ---
        grad_gamma.zero(); // Parameter grads are reset on each call here (unlike Linear, which accumulates until zero_grad())
        grad_beta.zero();
for (size_t g = 0; g < num_groups; ++g) {
size_t group_start = g * N;
double sigma_inv = x_sigma_inv_buffer.data[g]; // Retrieve pre-computed 1/σ
// Compute intermediate sums needed for dx_i calculation
double sum_dy_gamma = 0.0; // Σ(dL/dy_j * γ_j)
double sum_dy_gamma_xnorm = 0.0; // Σ(dL/dy_j * γ_j * x_hat_j)
for (size_t i = 0; i < N; ++i) {
double dy_gamma = grad_Y.data[group_start + i] * gamma.data[i];
sum_dy_gamma += dy_gamma;
sum_dy_gamma_xnorm += dy_gamma * x_norm_buffer.data[group_start + i];
}
// Compute gradients for learnable parameters for this group
for (size_t i = 0; i < N; ++i) {
grad_gamma.data[i] += grad_Y.data[group_start + i] * x_norm_buffer.data[group_start + i]; // dL/dγ
grad_beta.data[i] += grad_Y.data[group_start + i]; // dL/dβ
}
// Compute gradient w.r.t. input x for this group using the derived formula
// dL/dx_i = (σ_inv / N) * ( N * dL/dy_i * γ_i - sum_dy_gamma - x_hat_i * sum_dy_gamma_xnorm )
for (size_t i = 0; i < N; ++i) {
double dy_gamma_i = grad_Y.data[group_start + i] * gamma.data[i];
grad_X.data[group_start + i] = sigma_inv * (dy_gamma_i - (sum_dy_gamma + x_norm_buffer.data[group_start + i] * sum_dy_gamma_xnorm) / static_cast<double>(N));
}
}
}
void zero_grad() {
grad_gamma.zero();
grad_beta.zero();
}
};
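// Note (added for reference): each contiguous group of `normalized_shape` values is
// normalized independently, so for a [seq_len, d_model] input there are seq_len groups,
// matching GPT-2's LayerNorm over the embedding dimension.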
// Multi-Head Attention (Causal, Pre-LN, GPT-2 style) - FULLY CORRECTED
class MultiHeadAttention {
public:
int d_model, n_heads, d_k;
Linear c_attn; // Combined linear projection for Q, K, V: [d_model] -> [3 * d_model]
Linear c_proj; // Output projection: [d_model] -> [d_model]
// Buffers for forward/backward passes to store intermediate states for ALL heads
Tensor qkv_buffer; // [seq_len, 3 * d_model] - Combined QKV output from c_attn
// These are now 3D tensors to store data for each head
std::vector<Tensor> q_heads; // [n_heads] of [seq_len, d_k] - Q for each head
std::vector<Tensor> k_heads; // [n_heads] of [seq_len, d_k] - K for each head
std::vector<Tensor> v_heads; // [n_heads] of [seq_len, d_k] - V for each head
std::vector<Tensor> attn_logits_heads; // [n_heads] of [seq_len, seq_len] - Pre-softmax scores for each head
std::vector<Tensor> attn_weights_heads; // [n_heads] of [seq_len, seq_len] - Post-softmax weights for each head
Tensor stored_input; // [seq_len, d_model] - Input to forward pass (for backward)
MultiHeadAttention(int d_model, int n_heads)
: d_model(d_model), n_heads(n_heads), d_k(d_model / n_heads),
c_attn(d_model, 3 * d_model), // Single dense layer for all Q, K, V projections
c_proj(d_model, d_model) { // Output projection layer
// Initialize the 3D storage for each head
q_heads.resize(n_heads);
k_heads.resize(n_heads);
v_heads.resize(n_heads);
attn_logits_heads.resize(n_heads);
attn_weights_heads.resize(n_heads);
}
void forward(const Tensor& X, Tensor& Y) { // X, Y: [seq_len, d_model]
stored_input = X; // Store input for backward pass
size_t seq_len = X.dim(0);
// --- CRITICAL FIX: Ensure Y is correctly sized and zeroed ---
// The caller might pass a default Tensor {}, or one with wrong dims.
// We must ensure Y is [seq_len, d_model] and initialized to zero.
if (Y.shape.size() != 2 || Y.dim(0) != seq_len || Y.dim(1) != static_cast<size_t>(d_model)) {
Y = Tensor({seq_len, static_cast<size_t>(d_model)}); // Resize/assign correctly
}
Y.zero(); // Essential: Initialize output to zero before accumulation
// --- END CRITICAL FIX ---
// 1. Project input to Q, K, V using a single combined linear layer
c_attn.forward(X, qkv_buffer); // qkv_buffer: [seq_len, 3*d_model]
// 2. Initialize output tensor Y to zero (already done above)
// 3. Process each attention head
for (int h = 0; h < n_heads; ++h) {
int head_offset = h * d_k; // Offset in the d_model dimension for this head
// 4. Extract Q, K, V for this head from the combined qkv_buffer
// Conceptually, we slice qkv_buffer into Q_head, K_head, V_head [seq_len, d_k]
// For efficiency, we work directly with indices.
Tensor Q_head({seq_len, static_cast<size_t>(d_k)});
Tensor K_head({seq_len, static_cast<size_t>(d_k)});
Tensor V_head({seq_len, static_cast<size_t>(d_k)});
for(size_t t = 0; t < seq_len; ++t) {
for(int j = 0; j < d_k; ++j) {
Q_head.data[t * d_k + j] = qkv_buffer.data[t * 3 * d_model + 0 * d_model + head_offset + j]; // Q slice
K_head.data[t * d_k + j] = qkv_buffer.data[t * 3 * d_model + 1 * d_model + head_offset + j]; // K slice
V_head.data[t * d_k + j] = qkv_buffer.data[t * 3 * d_model + 2 * d_model + head_offset + j]; // V slice
}
}
// 5. Compute Scaled Dot-Product Attention Scores: Q @ K^T / sqrt(d_k)
// CORRECTED: Must transpose K_head before multiplication
Tensor scores({seq_len, seq_len});
Tensor K_head_T;
transpose(K_head, K_head_T); // Transpose K_head from [seq_len, d_k] to [d_k, seq_len]
matmul(Q_head, K_head_T, scores); // [seq_len, d_k] x [d_k, seq_len] -> [seq_len, seq_len]
double scale = 1.0 / std::sqrt(static_cast<double>(d_k));
for (auto& val : scores.data) {
val *= scale; // Apply scaling factor
}
// 6. Apply Causal Mask: Set future positions to -inf
for (size_t i = 0; i < seq_len; ++i) {
for (size_t j = i + 1; j < seq_len; ++j) {
scores.data[i * seq_len + j] = NEG_INF; // Mask future positions
}
}
// 7. Store pre-softmax scores for this head
attn_logits_heads[h] = scores; // Store for backward pass
// 8. Apply Softmax to get attention weights: A = softmax(scores)
Tensor attn({seq_len, seq_len});
softmax(scores, attn); // Apply numerically stable softmax
// 9. Store post-softmax weights for this head
attn_weights_heads[h] = attn;
// 10. Apply attention weights to values: Output_head = A @ V
Tensor head_out({seq_len, static_cast<size_t>(d_k)});
matmul(attn, V_head, head_out); // [seq_len, seq_len] x [seq_len, d_k] -> [seq_len, d_k]
// 11. Accumulate head outputs into the final output Y
// CRITICAL: This loop was the source of the segfault.
// It now correctly writes to the pre-initialized and zeroed Y tensor.
for (size_t t = 0; t < seq_len; ++t) {
for (int j = 0; j < d_k; ++j) {
// Y is guaranteed to be [seq_len, d_model] and zeroed.
Y.data[t * d_model + head_offset + j] += head_out.data[t * d_k + j];
}
}
// 12. Store the Q, K, V tensors for this head in the class members
q_heads[h] = Q_head;
k_heads[h] = K_head;
v_heads[h] = V_head;
}
// 13. Apply final output projection: Y = Y @ W_O + b_O
        Tensor Y_temp = Y; // Copy the concatenated head outputs; c_proj reads Y_temp and writes its result back into Y
c_proj.forward(Y_temp, Y); // Final projection
}
// Full backward pass for Multi-Head Attention implementing precise matrix calculus for ALL heads
// This follows the chain rule through softmax, matrix multiplication, and linear layers.
void backward(const Tensor& grad_output, Tensor& grad_input) { // grad_output, grad_input: [seq_len, d_model]
size_t seq_len = grad_output.dim(0);
// --- CRITICAL FIX: Ensure grad_input is correctly sized ---
if (grad_input.shape.size() != 2 || grad_input.dim(0) != seq_len || grad_input.dim(1) != static_cast<size_t>(d_model)) {
grad_input = Tensor({seq_len, static_cast<size_t>(d_model)}); // Resize/assign correctly
}
// --- END CRITICAL FIX ---
        grad_input = grad_output; // Copy of the upstream gradient, used as dL/dY for c_proj below
        // 1. Backward through output projection c_proj
        // dL/d(concat_head_out) = dL/dY * W_O^T
        Tensor grad_head_concat({seq_len, static_cast<size_t>(d_model)}); // Gradient w.r.t. concatenated head outputs
        c_proj.backward(grad_input, grad_head_concat); // grad_head_concat receives the grad w.r.t. c_proj's input; c_proj grads accumulate
        grad_input = grad_head_concat; // grad_input now holds dL/d(concatenated head outputs)
// 2. Initialize gradients for combined QKV projection
Tensor grad_qkv_combined({seq_len, static_cast<size_t>(3 * d_model)}, 0.0); // [seq_len, 3*d_model]
// 3. Backward through each attention head
for (int h = 0; h < n_heads; ++h) {
int head_offset = h * d_k;
// 4. Slice gradient for this head's contribution from grad_head_concat
Tensor grad_head_out({seq_len, static_cast<size_t>(d_k)}); // [seq_len, d_k]
for(size_t t = 0; t < seq_len; ++t) {
for(int j = 0; j < d_k; ++j) {
grad_head_out.data[t * d_k + j] = grad_input.data[t * d_model + head_offset + j];
}
}
// 5. Retrieve stored Q, K, V for this head (from class members)
const Tensor& Q_head = q_heads[h]; // [seq_len, d_k]
const Tensor& K_head = k_heads[h]; // [seq_len, d_k]
const Tensor& V_head = v_heads[h]; // [seq_len, d_k]
// 6. Retrieve stored attention weights for this head
const Tensor& attn = attn_weights_heads[h]; // [seq_len, seq_len] - Post-softmax weights
const Tensor& scores = attn_logits_heads[h]; // Pre-softmax scores (for reference if needed)
// --- Backprop through: head_out = attn @ V_head ---
// 7. dL/dV_head = attn^T @ dL/d(head_out)
// This multiplication is correct: [seq_len, seq_len] x [seq_len, d_k] -> [seq_len, d_k]
Tensor grad_V_head({seq_len, static_cast<size_t>(d_k)}); // [seq_len, d_k]
Tensor AT; // [seq_len, seq_len]
transpose(attn, AT); // Transpose attention weights
matmul(AT, grad_head_out, grad_V_head); // Matrix multiply to get dV
// 8. dL/d(attn) = dL/d(head_out) @ V_head^T
            // Must transpose V_head ([seq_len, d_k] -> [d_k, seq_len]) before multiplication
            Tensor grad_attn({seq_len, seq_len}); // [seq_len, seq_len]
Tensor V_head_T;
transpose(V_head, V_head_T); // Transpose V_head from [seq_len, d_k] to [d_k, seq_len]
matmul(grad_head_out, V_head_T, grad_attn); // [seq_len, d_k] x [d_k, seq_len] -> [seq_len, seq_len]
// --- Backprop through: attn = softmax(scores) ---
// 9. Apply causal mask to grad_attn (zero out gradients for masked positions)
for (size_t i = 0; i < seq_len; ++i) {
for (size_t j = i + 1; j < seq_len; ++j) {
grad_attn.data[i * seq_len + j] = 0.0; // Mask future positions
}
}
// 10. dL/d(scores) = softmax_grad(A, dL/dA)
// Standard softmax gradient: dS_ij = A_ij * (dA_ij - sum_k(dA_ik * A_ik))
// This is the gradient of the softmax function itself.
Tensor grad_scores({seq_len, seq_len}); // [seq_len, seq_len]
for (size_t i = 0; i < seq_len; ++i) {
double sum = 0.0;
for (size_t k = 0; k < seq_len; ++k) {
sum += grad_attn.data[i * seq_len + k] * attn.data[i * seq_len + k]; // sum_k(dA_ik * A_ik)
}
for (size_t j = 0; j < seq_len; ++j) {
// dS_ij = A_ij * (dA_ij - sum_k(dA_ik * A_ik))
grad_scores.data[i * seq_len + j] = attn.data[i * seq_len + j] * (grad_attn.data[i * seq_len + j] - sum);
}
}
            // 11. Backprop through the 1/sqrt(d_k) scaling applied in the forward pass
            double scale = 1.0 / std::sqrt(static_cast<double>(d_k));
            for (auto& val : grad_scores.data) {
                val *= scale; // dL/d(raw Q@K^T) = dL/d(scaled scores) / sqrt(d_k)
            }
// --- Backprop through: scores = Q_head @ K_head^T / sqrt(d_k) ---
// 12. dL/dQ_head = dL/d(scores) @ K_head
// grad_scores: [seq_len, seq_len], K_head: [seq_len, d_k] -> grad_Q_head: [seq_len, d_k]
// This multiplication is correct: [seq_len, seq_len] x [seq_len, d_k] -> [seq_len, d_k]
Tensor grad_Q_head({seq_len, static_cast<size_t>(d_k)}); // [seq_len, d_k]
matmul(grad_scores, K_head, grad_Q_head); // dQ = dS @ K
// 13. dL/dK_head = dL/d(scores)^T @ Q_head
// grad_scores^T: [seq_len, seq_len], Q_head: [seq_len, d_k] -> grad_K_head: [seq_len, d_k]
// This multiplication is correct: [seq_len, seq_len] x [seq_len, d_k] -> [seq_len, d_k]
Tensor grad_K_head({seq_len, static_cast<size_t>(d_k)}); // [seq_len, d_k]
Tensor grad_scores_T; // [seq_len, seq_len]
transpose(grad_scores, grad_scores_T); // Transpose dL/d(scores)
matmul(grad_scores_T, Q_head, grad_K_head); // dK = dS^T @ Q
// 14. Accumulate gradients w.r.t. Q, K, V for this head into the combined gradient tensor
// These gradients will be used to compute gradients for the c_attn layer.
for(size_t t = 0; t < seq_len; ++t) {
for(int j = 0; j < d_k; ++j) {
// Accumulate dQ, dK, dV into the appropriate slices of grad_qkv_combined
grad_qkv_combined.data[t * 3 * d_model + 0 * d_model + head_offset + j] += grad_Q_head.data[t * d_k + j]; // dQ
grad_qkv_combined.data[t * 3 * d_model + 1 * d_model + head_offset + j] += grad_K_head.data[t * d_k + j]; // dK
grad_qkv_combined.data[t * 3 * d_model + 2 * d_model + head_offset + j] += grad_V_head.data[t * d_k + j]; // dV
}
}
}
// 15. Backward through combined linear projection (c_attn)
// This computes dL/dX (input to attention) and accumulates dL/dW and dL/db for c_attn.
c_attn.backward(grad_qkv_combined, grad_input); // grad_input now holds dL/dX
}
void zero_grad() {
c_attn.zero_grad();
c_proj.zero_grad();
}
};
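// Shape summary (added for reference): for X [seq_len, d_model], c_attn yields
// [seq_len, 3*d_model]; each head h works on [seq_len, d_k] slices with
// d_k = d_model / n_heads and writes its output into columns [h*d_k, (h+1)*d_k) of Y,
// which c_proj then maps [seq_len, d_model] -> [seq_len, d_model].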
// Feed-Forward Network (Pre-LN, GELU, GPT-2 style)
class FeedForward {
public:
Linear c_fc; // Fully connected layer (d_model -> 4 * d_model)
Linear c_proj; // Projection layer (4 * d_model -> d_model)
Tensor gelu_input_buffer; // [seq_len, 4 * d_model] - Stored for backward pass (pre-GELU activation)
FeedForward(int d_model)
: c_fc(d_model, 4 * d_model), // Expand to 4x width
c_proj(4 * d_model, d_model) {} // Project back to d_model
    void forward(const Tensor& X, Tensor& Y) { // X, Y: [seq_len, d_model]
        // 1. Linear transformation: hidden = X @ W_fc + b_fc
        c_fc.forward(X, gelu_input_buffer); // gelu_input_buffer: [seq_len, 4 * d_model] (pre-GELU)
        // 2. Apply GELU into a separate tensor so the pre-GELU values in gelu_input_buffer
        //    remain available for the backward pass (gelu_grad must be evaluated at the pre-GELU input).
        Tensor gelu_out = gelu_input_buffer; // Same shape; contents overwritten by gelu()
        gelu(gelu_input_buffer, gelu_out);
        // 3. Output projection: Y = hidden @ W_proj + b_proj
        c_proj.forward(gelu_out, Y); // Y: [seq_len, d_model]
    }
// Full backward pass for Feed-Forward Network
void backward(const Tensor& grad_output, Tensor& grad_input) { // grad_output, grad_input: [seq_len, d_model]
// --- CRITICAL FIX: Ensure grad_input is correctly sized ---
size_t seq_len = grad_output.dim(0);
if (grad_input.shape.size() != 2 || grad_input.dim(0) != seq_len || grad_input.dim(1) != c_fc.W.dim(1)) { // c_fc.W.dim(1) is d_model
grad_input = Tensor({seq_len, c_fc.W.dim(1)}); // Resize correctly [seq_len, d_model]
}
// --- END CRITICAL FIX ---
// 1. Backward through output projection c_proj
// dL/d(hidden) = dL/dY * W_proj^T
c_proj.backward(grad_output, grad_input); // grad_input now holds dL/d(hidden) [seq_len, 4*d_model]
// 2. Backward through GELU activation
// dL/d(gelu_input) = dL/d(hidden) * gelu_grad(gelu_input)
// grad_input currently holds dL/d(hidden), gelu_input_buffer holds the pre-GELU values.
        Tensor grad_gelu_input = grad_input; // Copy of dL/d(hidden); grad_input is overwritten below with dL/d(gelu_input)
gelu_grad(gelu_input_buffer, grad_gelu_input, grad_input); // grad_input now holds dL/d(gelu_input) [seq_len, 4*d_model]
// 3. Backward through first linear layer c_fc
// dL/dX = dL/d(gelu_input) * W_fc^T
// This also accumulates gradients for c_fc's weights and biases.
Tensor grad_X_temp({grad_input.dim(0), c_fc.W.dim(1)}); // [seq_len, d_model]
c_fc.backward(grad_input, grad_X_temp); // grad_X_temp holds dL/dX
grad_input = std::move(grad_X_temp); // Assign final gradient w.r.t. FFN input
}
void zero_grad() {
c_fc.zero_grad();
c_proj.zero_grad();
}
};
// Decoder Block (Pre-LN as in GPT-2/3) - FULLY CORRECTED
class DecoderBlock {
public:
int d_model, n_heads, d_k; // Store d_k for use in backward pass
LayerNorm ln_1; // Pre-attention layer norm
MultiHeadAttention attn; // Self-attention mechanism
LayerNorm ln_2; // Pre-FFN layer norm
FeedForward ff; // Feed-forward network
// Buffers for intermediate computations during forward/backward
// These are member variables that need to be correctly sized on each forward/backward pass.
Tensor ln1_out_buffer, attn_out_buffer, residual1_buffer;
Tensor ln2_out_buffer, ff_out_buffer;
DecoderBlock(int d_model, int n_heads)
: d_model(d_model), n_heads(n_heads), d_k(d_model / n_heads),
ln_1(d_model),
attn(d_model, n_heads),
ln_2(d_model),
ff(d_model) {}
void forward(const Tensor& X, Tensor& Y) { // X, Y: [seq_len, d_model]
size_t seq_len = X.dim(0); // Get the dynamic sequence length
// --- CRITICAL FIX: Ensure all member buffers are correctly sized ---
// This is the key fix identified by the gdb segfault analysis.
// Default-constructed Tensors have shape {} and empty data.
// If passed to sub-components without resizing, they cause crashes.
if (ln1_out_buffer.shape.size() != 2 || ln1_out_buffer.dim(0) != seq_len || ln1_out_buffer.dim(1) != static_cast<size_t>(d_model)) {
ln1_out_buffer = Tensor({seq_len, static_cast<size_t>(d_model)});
}
if (attn_out_buffer.shape.size() != 2 || attn_out_buffer.dim(0) != seq_len || attn_out_buffer.dim(1) != static_cast<size_t>(d_model)) {
attn_out_buffer = Tensor({seq_len, static_cast<size_t>(d_model)});
}
if (residual1_buffer.shape.size() != 2 || residual1_buffer.dim(0) != seq_len || residual1_buffer.dim(1) != static_cast<size_t>(d_model)) {
residual1_buffer = Tensor({seq_len, static_cast<size_t>(d_model)});
}
if (ln2_out_buffer.shape.size() != 2 || ln2_out_buffer.dim(0) != seq_len || ln2_out_buffer.dim(1) != static_cast<size_t>(d_model)) {
ln2_out_buffer = Tensor({seq_len, static_cast<size_t>(d_model)});
}
if (ff_out_buffer.shape.size() != 2 || ff_out_buffer.dim(0) != seq_len || ff_out_buffer.dim(1) != static_cast<size_t>(d_model)) {
ff_out_buffer = Tensor({seq_len, static_cast<size_t>(d_model)});
}
// --- END CRITICAL FIX ---
// 1. Pre-Layer Normalization + Multi-Head Attention
ln_1.forward(X, ln1_out_buffer); // Apply LN to input X
attn.forward(ln1_out_buffer, attn_out_buffer); // Compute attention on normalized input
add(X, attn_out_buffer, residual1_buffer); // Residual connection: X + attn_out
// 2. Pre-Layer Normalization + Feed-Forward Network
ln_2.forward(residual1_buffer, ln2_out_buffer); // Apply LN to residual1
ff.forward(ln2_out_buffer, ff_out_buffer); // Compute FFN on normalized residual
add(residual1_buffer, ff_out_buffer, Y); // Final residual connection: residual1 + ff_out -> Y
}
// Full backward pass for Decoder Block chaining gradients through sub-components
    void backward(const Tensor& grad_output, Tensor& grad_input) { // grad_output, grad_input: [seq_len, d_model]
        size_t seq_len = grad_output.dim(0);
        // --- CRITICAL FIX: Ensure gradient tensors passed to sub-module backward() are correctly sized ---
        // Default-constructed tensors passed as grad_X will fail assertions in sub-modules.
        Tensor grad_ln2_out = grad_output;  // [seq_len, d_model] - copy is fine as the initial value
        Tensor grad_ff_out = grad_output;   // [seq_len, d_model] - dL/d(ff_out) from the second residual
        Tensor grad_residual1({seq_len, static_cast<size_t>(d_model)}); // MUST BE PRE-SIZED [seq_len, d_model]
        Tensor grad_ln1_out({seq_len, static_cast<size_t>(d_model)});   // MUST BE PRE-SIZED [seq_len, d_model]
        Tensor grad_attn_out({seq_len, static_cast<size_t>(d_model)});  // MUST BE PRE-SIZED [seq_len, d_model]
        // --- END CRITICAL FIX ---
        // 1. Backward through the second residual connection (Y = residual1 + ff_out):
        //    dL/d(ff_out) = dL/dY, and dL/d(residual1) = dL/dY plus the FFN-branch contribution computed below.
        // 2. Backward through the Feed-Forward Network.
        //    This computes dL/d(ln2_out) and accumulates gradients inside the FFN.
        ff.backward(grad_ff_out, grad_ln2_out); // grad_ln2_out is updated with dL/d(ln2_out)
        // 3. Backward through the second LayerNorm (pre-FFN).
        //    This computes the FFN-branch part of dL/d(residual1) and accumulates LN2 parameter gradients.
        ln_2.backward(grad_ln2_out, grad_residual1); // grad_residual1 holds dL/d(residual1) via the FFN branch
        add_inplace(grad_residual1, grad_output);    // Add the skip-path gradient: dL/d(residual1) += dL/dY
        // 4. Backward through the first residual connection (residual1 = X + attn_out):
        //    dL/d(attn_out) = dL/d(residual1), and dL/dX = dL/d(residual1) plus the attention-branch contribution below.
        grad_attn_out = grad_residual1; // Copy into the correctly sized tensor for the attention branch
        // 5. Backward through Multi-Head Attention.
        //    This computes dL/d(ln1_out) and accumulates gradients inside the attention mechanism.
        attn.backward(grad_attn_out, grad_ln1_out); // grad_ln1_out is updated with dL/d(ln1_out)
        // 6. Backward through the first LayerNorm (pre-attention).
        //    This computes dL/dX through the attention branch and accumulates LN1 parameter gradients.
        ln_1.backward(grad_ln1_out, grad_input); // grad_input holds dL/dX via the attention branch
        add_inplace(grad_input, grad_residual1); // Add the skip-path gradient: dL/dX += dL/d(residual1)
    }
void zero_grad() {
ln_1.zero_grad();
attn.zero_grad();
ln_2.zero_grad();
ff.zero_grad();
}
};
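// Dataflow summary (added for reference): this block computes the Pre-LN residual form
//     r1 = X + Attn(LN1(X))
//     Y  = r1 + FFN(LN2(r1))
// and backward() mirrors it, adding the skip-path gradient at each residual join.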
// Embedding Layers
class Embedding {
public:
int num_embeddings, embedding_dim;
Tensor weight; // [num_embeddings, embedding_dim] - Token embedding matrix
std::vector<int> input_ids_buffer; // Stored input IDs for backward pass
Tensor grad_weight; // [num_embeddings, embedding_dim] - Gradients w.r.t. embedding weights
Embedding(int num_embeddings, int embedding_dim)
: num_embeddings(num_embeddings), embedding_dim(embedding_dim),
weight({static_cast<size_t>(num_embeddings), static_cast<size_t>(embedding_dim)}),
grad_weight({static_cast<size_t>(num_embeddings), static_cast<size_t>(embedding_dim)}, 0.0) {
// Xavier initialization for embeddings
double stddev = std::sqrt(1.0 / embedding_dim);
fill_normal(weight, 0.0, stddev);
}
void forward(const std::vector<int>& input_ids, Tensor& output) { // output: [seq_len, embedding_dim]
input_ids_buffer = input_ids; // Store for backward pass
size_t seq_len = input_ids.size();
// For each token ID in the input sequence, lookup its embedding vector.
for (size_t i = 0; i < seq_len; ++i) {
int id = input_ids[i];
if (id >= 0 && id < num_embeddings) {
// Copy the embedding vector for token 'id' to the output at position 'i'.
std::memcpy(&output.data[i * embedding_dim], &weight.data[id * embedding_dim], embedding_dim * sizeof(double));
} else {
// Handle out-of-vocabulary or padding tokens by setting embedding to zero.
std::memset(&output.data[i * embedding_dim], 0, embedding_dim * sizeof(double));
}
}
}
// Backward pass for Embedding layer (sparse gradient update)
// The gradient w.r.t. the embedding weights is sparse: only rows corresponding
// to input token IDs receive non-zero gradients.
void backward(const Tensor& grad_output) { // grad_output: [seq_len, embedding_dim]
size_t seq_len = input_ids_buffer.size();
// For each position in the input sequence, accumulate the gradient
// from grad_output into the corresponding row of grad_weight.
for (size_t i = 0; i < seq_len; ++i) {
int id = input_ids_buffer[i];
if (id >= 0 && id < num_embeddings) {
// Accumulate gradients into the row of grad_weight corresponding to token 'id'.
for (int j = 0; j < embedding_dim; ++j) {
grad_weight.data[id * embedding_dim + j] += grad_output.data[i * embedding_dim + j];
}
}
// Gradients for OOV/padding tokens (id < 0 or id >= num_embeddings) are ignored.
}
}
void zero_grad() {
grad_weight.zero(); // Zero out all gradients in the embedding matrix.
}
};
class PositionalEncoding {
public:
int d_model, max_len;
Tensor pe; // [max_len, d_model] - Pre-computed positional encodings
PositionalEncoding(int d_model, int max_len) : d_model(d_model), max_len(max_len), pe({static_cast<size_t>(max_len), static_cast<size_t>(d_model)}) {
// Compute fixed positional encodings as per "Attention Is All You Need" paper.
for (int pos = 0; pos < max_len; ++pos) {
for (int i = 0; i < d_model; ++i) {
// Calculate the denominator term: 10000^(2i/d_model)
double div_term = std::pow(10000.0, static_cast<double>(2 * (i / 2)) / d_model);
// Apply sin for even indices, cos for odd indices.
if (i % 2 == 0) {
pe.data[pos * d_model + i] = std::sin(static_cast<double>(pos) / div_term);
} else {
pe.data[pos * d_model + i] = std::cos(static_cast<double>(pos) / div_term);
}
}
}
}
void forward(int seq_len, Tensor& output) { // output: [seq_len, d_model]
// Simply copy the first 'seq_len' rows from the pre-computed positional encodings.
assert(seq_len <= max_len);
std::memcpy(output.data.data(), pe.data.data(), seq_len * d_model * sizeof(double));
}
};
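// Formula reference (added): PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and
// PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)), the fixed sinusoidal encoding from
// "Attention Is All You Need"; note that GPT-2 itself learns its position embeddings,
// so the fixed table here is a simplification.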
// --- Full GPT Model (GPT-2/3 style architecture) ---
class GPT {
public:
int vocab_size, d_model, n_heads, n_layers, max_seq_len;
Embedding wte; // Token embeddings [vocab_size, d_model]
PositionalEncoding wpe; // Positional embeddings [max_seq_len, d_model]
std::vector<std::unique_ptr<DecoderBlock>> h; // Stack of N decoder blocks
LayerNorm ln_f; // Final layer normalization [d_model]