Commit b7655c7

[tmva][sofie] Improve AD-friendliness of emitted code for Clad
This commit refactors SOFIE-generated inference code to enable correct and efficient reverse-mode automatic differentiation with Clad. Key changes:

* Introduce explicit primitive operations (`Copy`, `Fill`, `Relu`) in SOFIE_common.hxx and provide corresponding custom pullbacks in CladDerivator.h. This replaces previously inlined loops and allows Clad to generate efficient gradient code without relying on tapes or loop-level differentiation.
* Update Gemm code generation to emit Copy/Fill instead of manually expanding bias-initialization loops. This better exposes the intent and improves AD performance and correctness.
* Replace manual ReLU loops with a dedicated Relu() call, enabling a custom pullback that avoids tape-based condition tracking.
* Generate an additional "unoptimized" model variant in the SOFIE test suite (`OptimizationLevel::kBasic`) and use it for the AD tests. This disables memory reuse of intermediate tensors: opaque memory reuse is safe for inference but breaks source-transformation AD.
* Improve gradient-test diagnostics in the SOFIE Clad tests by reporting mismatched indices instead of only checking a global maximum difference.

With these changes, Clad-generated gradients for SOFIE models are both correct and significantly faster, reaching performance comparable to frameworks such as PyTorch and JAX on the CPU for the tested cases (fully connected neural networks with multiple layers).
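As background on the mechanism the commit relies on: Clad substitutes a hand-written reverse-mode rule for a function `f` whenever it finds a matching `f_pullback` in the `clad::custom_derivatives` namespace, mirrored to the function's own namespace (which is exactly what the CladDerivator.h hunk below does for `TMVA::Experimental::SOFIE`). A minimal sketch of that convention for a toy scalar function, assuming the pointer-style adjoint signature this commit uses; the `ops` namespace and `square` are illustrative, not part of the commit:

```cpp
#include "clad/Differentiator/Differentiator.h"

namespace ops {
inline double square(double x) { return x * x; }
} // namespace ops

namespace clad::custom_derivatives::ops {
// Hand-written reverse-mode rule: accumulate dy/dx * d_y = 2*x*d_y into the
// adjoint of x. Clad calls this instead of differentiating square's body.
inline void square_pullback(double x, double _d_y, double *_d_x)
{
   *_d_x += 2 * x * _d_y;
}
} // namespace clad::custom_derivatives::ops

double use(double x) { return ops::square(x) + x; }

int main()
{
   auto g = clad::gradient(use); // reverse-mode AD of use(x)
   double x = 3.0, dx = 0.0;
   g.execute(x, &dx);            // dx == 2*x + 1 == 7
   return 0;
}
```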
1 parent a22fe0e commit b7655c7

5 files changed

Lines changed: 139 additions & 47 deletions

math/mathcore/inc/Math/CladDerivator.h

Lines changed: 29 additions & 0 deletions
@@ -1169,6 +1169,35 @@ inline void Gemm_Call_pullback(float *output, bool transa, bool transb, int m, i
    }
 }
 
+inline void Copy_pullback(float *output, const float *input, int size, float *_d_output, float *_d_input, int *)
+{
+   for (int i = 0; i < size; i++) {
+      output[i] = input[i];
+      _d_input[i] += _d_output[i];
+      _d_output[i] = 0.F;
+   }
+}
+
+inline void Fill_pullback(float *output, float value, int size, float *_d_output, float *_d_value, int *)
+{
+   for (int i = 0; i < size; i++) {
+      output[i] = value;
+      *_d_value += _d_output[i];
+      _d_output[i] = 0.F;
+   }
+}
+
+inline void Relu_pullback(float *output, const float *input, int size, float *_d_output, float *_d_input, int *)
+{
+   for (int i = 0; i < size; i++) {
+      output[i] = input[i] > 0.F ? input[i] : 0.F;
+      float _r_d0 = _d_output[i];
+      _d_output[i] = 0.F;
+      if (input[i] > 0.F)
+         _d_input[i] += _r_d0;
+   }
+}
+
 } // namespace TMVA::Experimental::SOFIE
 
 } // namespace clad::custom_derivatives
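The pullback contract is visible in the signatures above: the original arguments come first, followed by the adjoint buffers in the same order (the unnamed trailing `int *` is the unused adjoint of `size`). Because `Relu_pullback` recomputes the branch condition from the saved `input`, no tape entry is needed. A standalone numeric check of this behavior, with the body copied from the hunk above so it compiles without ROOT:

```cpp
#include <cstdio>

// Body copied verbatim from the CladDerivator.h hunk above.
inline void Relu_pullback(float *output, const float *input, int size,
                          float *_d_output, float *_d_input, int *)
{
   for (int i = 0; i < size; i++) {
      output[i] = input[i] > 0.F ? input[i] : 0.F;
      float _r_d0 = _d_output[i];
      _d_output[i] = 0.F;
      if (input[i] > 0.F)
         _d_input[i] += _r_d0;
   }
}

int main()
{
   float in[4] = {-1.F, 2.F, -3.F, 4.F};
   float out[4] = {};
   float d_out[4] = {1.F, 1.F, 1.F, 1.F}; // incoming adjoint of the output
   float d_in[4] = {};                    // adjoint of the input, accumulated into
   Relu_pullback(out, in, 4, d_out, d_in, nullptr);
   for (int i = 0; i < 4; ++i)
      std::printf("d_in[%d] = %g\n", i, d_in[i]); // prints 0 1 0 1
   return 0;
}
```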

tmva/sofie/inc/TMVA/ROperator_Gemm.hxx

Lines changed: 16 additions & 18 deletions
@@ -397,29 +397,27 @@ namespace SOFIE{
    else
       out << "j;\n";
 
-   out << SP2 << SP << "for (size_t k = 0; k < " << sY[1] << "; k++) { \n";
-   std::string bias_index;
-   if (sC.size() != 2)
+   std::string prefix = SP2 + SP + "TMVA::Experimental::SOFIE::";
+   std::string target = "tensor_" + fNY;
+   if (sC.size() != 2) {
       throw std::runtime_error("TMVA SOFIE Gemm Op - invalid rank for bias tensor " + ConvertDimShapeToString(fDimShapeC) + ConvertDimShapeToString(sC));
-   if (sC[0].GetVal() == "1" && sC[1].GetVal() == sY[1].GetVal())
-      bias_index = "k";
-   else if (sC[1].GetVal() == "1" && sC[0].GetVal() == sY[0].GetVal())
-      bias_index = "j";
-   else if (sC[0].GetVal() == "1" && sC[1].GetVal() == "1") // scalar case
-      bias_index = "0";
-   else {
+   } if (sC[0].GetVal() == "1" && sC[1].GetVal() == sY[1].GetVal()) {
+      out << prefix << "Copy(" << target << " + y_index, tensor_" << fNC << ", " << sY[1] << ");\n";
+   } else if (sC[1].GetVal() == "1" && sC[0].GetVal() == sY[0].GetVal()) {
+      out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[j], " << sY[1] << ");\n";
+   } else if (sC[0].GetVal() == "1" && sC[1].GetVal() == "1") {
+      // scalar case
+      out << prefix << "Fill(" << target << " + y_index, tensor_" << fNC << "[0], " << sY[1] << ");\n";
+   } else {
       throw std::runtime_error("TMVA SOFIE Gemm Op - invalid shape for bias tensor " + ConvertDimShapeToString(fDimShapeC));
    }
 
-   out << SP2 << SP << SP << "tensor_" << fNY << "[y_index + k] = " << "tensor_" << fNC << "[" << bias_index << "];\n";
-   out << SP2 << SP << "}\n";
    out << SP2 << "}\n";
    }
 
    if (fType == "float"){
 
-   out << SP2 << "TMVA::Experimental::SOFIE::Gemm_Call("
-       << "tensor_" << fNY;
+   out << SP2 << "TMVA::Experimental::SOFIE::Gemm_Call(" << "tensor_" << fNY;
    if (doStackMul) out << " + " << opName << "_y_offset";
    out << ", "
        << (fAttrTransB ? "true, " : "false, ")
@@ -461,15 +459,15 @@ namespace SOFIE{
    // fuse with Relu
    if(fActivation == EActivationType::RELU){
       out << SP << "//--- applying RELU to output\n";
-      out << SP << "for (int id = 0; id < " << ConvertDimShapeToLength(fShapeY) << " ; id++){\n";
-      out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n";
-      out << SP << "}\n";
+      std::string tnsr = "tensor_" + fNY;
+      std::string reluSize = ConvertDimShapeToLength(fShapeY);
+      out << SP << "TMVA::Experimental::SOFIE::Relu(" << tnsr << ", " << tnsr << ", " << reluSize << ");\n";
    }
 
    return out.str();
    }
 
-   std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; }
+   std::vector<std::string> GetBlasRoutines() override { return {"Gemm", "Gemv"}; }
 
    };

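To make the generator change concrete, here is an illustrative before/after of the emitted bias initialization for a bias of shape (1, N) broadcast across the output rows. The tensor names and the `bias_init_*` wrappers are placeholders for this sketch, not the output of a real generated model:

```cpp
#include <algorithm>
#include <cstddef>

// Minimal stand-in for the primitive added to SOFIE_common.hxx in this commit.
namespace TMVA::Experimental::SOFIE {
inline void Copy(float *output, float const *input, int size) { std::copy(input, input + size, output); }
}

// Shape of the code the generator used to emit: a per-element loop that Clad
// had to differentiate element by element.
void bias_init_before(float *tensor_Y, const float *tensor_C, std::size_t y_index, int N)
{
   for (int k = 0; k < N; k++)
      tensor_Y[y_index + k] = tensor_C[k];
}

// Shape of the code emitted after this commit: a single primitive call, for
// which CladDerivator.h supplies Copy_pullback.
void bias_init_after(float *tensor_Y, const float *tensor_C, std::size_t y_index, int N)
{
   TMVA::Experimental::SOFIE::Copy(tensor_Y + y_index, tensor_C, N);
}
```

The (N, 1) and scalar bias cases go through `Fill` instead, matching the two `Fill(...)` branches in the hunk above.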
tmva/sofie/inc/TMVA/SOFIE_common.hxx

Lines changed: 17 additions & 0 deletions
@@ -772,6 +772,23 @@ inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int
               &beta, output, ldc);
 }
 
+inline void Fill(float *output, float value, int size)
+{
+   std::fill(output, output + size, value);
+}
+
+inline void Copy(float *output, float const *input, int size)
+{
+   std::copy(input, input + size, output);
+}
+
+inline void Relu(float *output, float const *input, int size)
+{
+   for (int i = 0; i < size; i++) {
+      output[i] = (input[i] > 0.0f) ? input[i] : 0.0f;
+   }
+}
+
 template <class T>
 void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength)
 {
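One detail worth noting: the fused ReLU emitted by ROperator_Gemm.hxx passes the same buffer as both `output` and `input`, so `Relu` must tolerate full aliasing, which this element-wise form does. A quick standalone check of the three primitives (bodies copied from the hunk above):

```cpp
#include <algorithm>
#include <cstdio>

// Copied verbatim from the SOFIE_common.hxx hunk above.
inline void Fill(float *output, float value, int size)
{
   std::fill(output, output + size, value);
}

inline void Copy(float *output, float const *input, int size)
{
   std::copy(input, input + size, output);
}

inline void Relu(float *output, float const *input, int size)
{
   for (int i = 0; i < size; i++) {
      output[i] = (input[i] > 0.0f) ? input[i] : 0.0f;
   }
}

int main()
{
   float t[4];
   float src[4] = {-1.0f, 2.0f, -3.0f, 4.0f};
   Fill(t, 0.5f, 4); // t = {0.5, 0.5, 0.5, 0.5}
   Copy(t, src, 4);  // t = {-1, 2, -3, 4}
   Relu(t, t, 4);    // in-place, as in the Gemm ReLU fusion: t = {0, 2, 0, 4}
   std::printf("%g %g %g %g\n", t[0], t[1], t[2], t[3]);
   return 0;
}
```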

tmva/sofie/test/EmitFromONNX.cxx.in

Lines changed: 42 additions & 12 deletions
@@ -9,22 +9,52 @@
 #include "TMVA/RModel.hxx"
 #include "TMVA/RModelParser_ONNX.hxx"
 
-using namespace TMVA::Experimental::SOFIE;
-
-int EmitModel(std::string filename, std::string outname) {
+int EmitModel(std::string filename, std::string outname)
+{
+   using namespace TMVA::Experimental::SOFIE;
 
    std::cout << "parsing file ..." << filename << std::endl;
-   RModelParser_ONNX parser;
-   RModel model = parser.Parse(filename);
-   model.Generate();
-   model.OutputGenerated(outname+"_FromONNX.hxx");
+   {
+      // The generated code with all optimizations. Used for most SOFIE tests.
+      RModelParser_ONNX parser;
+      RModel model = parser.Parse(filename);
+      model.Generate();
+      model.OutputGenerated(outname + "_FromONNX.hxx");
+   }
+   {
+      // Generate code without memory re-use for intermediate tensors.
+      //
+      // IMPORTANT:
+      // When memory re-use is enabled, SOFIE may assign multiple intermediate
+      // tensors to the same memory buffer. This means that values produced earlier
+      // in the forward pass can be overwritten by later operations.
+      //
+      // This is safe for inference, but it breaks source-transformation automatic
+      // differentiation (e.g. with Clad). In reverse-mode AD, the backward pass
+      // needs access to the original intermediate values from the forward pass
+      // (e.g. inputs to activations like ReLU). If those values have been
+      // overwritten, the generated gradient code will read incorrect data and
+      // produce wrong results.
+      //
+      // Since Clad operates on the generated source code and is not aware of these
+      // aliasing/reuse optimizations, it cannot reconstruct or recompute the lost
+      // values. Therefore we disable memory re-use here to ensure correctness of
+      // the differentiated code.
+      //
+      // Note: this increases memory usage but is required for AD correctness.
+      RModelParser_ONNX parser;
+      RModel model = parser.Parse(filename);
+      model.SetOptimizationLevel(OptimizationLevel::kBasic);
+      model.Generate();
+      model.OutputGenerated(outname + "_FromONNX_unoptimized.hxx");
+   }
 
    return 0;
 }
 
-int main(int argc, char *argv[]){
-
-   @EMIT_CAPTURES@ ;
-
+int main(int argc, char *argv[])
+{
+   // clang-format off
+   @EMIT_CAPTURES@;
+   // clang-format on
 }
-
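A tiny illustration of the hazard the comment above describes, with hypothetical buffers rather than SOFIE-generated code: once two intermediates share one buffer, the value the ReLU pullback needs in the backward pass is already gone.

```cpp
#include <cstdio>

int main()
{
   float buf; // one slot reused for two intermediate values

   // Forward pass with memory reuse:
   buf = -1.0f;                              // intermediate A: input of a ReLU
   float relu_out = buf > 0.0f ? buf : 0.0f; // ReLU(A) = 0
   buf = 5.0f;                               // intermediate B reuses A's slot

   // Backward pass: the ReLU pullback must branch on the *original* A (-1),
   // but the reused slot now holds B (5), so the adjoint is routed wrongly.
   float d_out = 1.0f, d_in = 0.0f;
   if (buf > 0.0f) // reads B instead of A
      d_in += d_out;

   std::printf("d_in = %g (correct value: 0)\n", d_in);
   (void)relu_out;
   return 0;
}
```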
tmva/sofie/test/TestCladAutodiff.cxx

Lines changed: 35 additions & 17 deletions
@@ -1,5 +1,5 @@
-constexpr auto modelHeaderSuffix = "_FromONNX.hxx";
-constexpr auto modelDataSuffix = "_FromONNX.dat";
+constexpr auto modelHeaderSuffix = "_FromONNX_unoptimized.hxx";
+constexpr auto modelDataSuffix = "_FromONNX_unoptimized.dat";
 #include "test_helpers.h"
 
 #include "input_models/references/Linear_16.ref.hxx"
@@ -76,28 +76,46 @@ float Linear_16_wrapper_num_diff(TMVA_SOFIE_Linear_16::Session const &session, f
       .c_str());
 
    // If you want to see the gradient code:
-   // gInterpreter->ProcessLine("static_cast<void (*)(TMVA_SOFIE_Linear_16::Session const &, float const *, float
-   // *)>(Linear_16_outer_wrapper_grad_1)"); gInterpreter->ProcessLine("Linear_16_wrapper_pullback");
+   // clang-format off
+   // gInterpreter->ProcessLine("static_cast<void (*)(TMVA_SOFIE_Linear_16::Session const &, float const *, float *)>(Linear_16_outer_wrapper_grad_1)");
+   // gInterpreter->ProcessLine("Linear_16_wrapper_pullback");
    // gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::doInfer_reverse_forw");
    // gInterpreter->ProcessLine("TMVA_SOFIE_Linear_16::doInfer_pullback");
+   // clang-format on
 
-   auto retVal = gInterpreter->ProcessLine((R"(
-   double maxDiff = 0;
+   gInterpreter->ProcessLine((R"(
+   float numeric_output[1600]{};
    for (std::size_t i = 0; i < std::size(grad_output); ++i) {
-      double val = grad_output[i];
-      double ref = Linear_16_wrapper_num_diff(session_linear_16, )" +
-                                          inputInterp + R"(, i);
-      if (val != ref) {
-         maxDiff = std::max(std::abs(val - ref), maxDiff);
-      }
+      numeric_output[i] = Linear_16_wrapper_num_diff(session_linear_16, )" +
+                             inputInterp + R"(, i);
    }
-   double tol = 0.0025;
-   // the "return" value
-   (maxDiff < tol);
    )")
-                     .c_str());
+                                 .c_str());
+
+   double tol = 0.0025;
+
+   auto arr_size = static_cast<std::size_t>(gInterpreter->ProcessLine("std::size(grad_output);"));
+   auto grad_arr = reinterpret_cast<float *>(gInterpreter->ProcessLine("grad_output;"));
+   auto numeric_arr = reinterpret_cast<float *>(gInterpreter->ProcessLine("numeric_output;"));
+
+   constexpr std::size_t kMaxPrint = 10;
+   std::size_t mismatchCount = 0;
 
-   EXPECT_EQ(retVal, 1) << "The gradient from Clad and the numeric gradient didn't match within tolerance.";
+   for (std::size_t i = 0; i < arr_size; ++i) {
+      double diff = std::abs(grad_arr[i] - numeric_arr[i]);
+
+      if (diff > tol) {
+         if (mismatchCount < kMaxPrint) {
+            ADD_FAILURE() << "Mismatch at index " << i << " analytic=" << grad_arr[i] << " numeric=" << numeric_arr[i]
+                          << " diff=" << diff;
+         }
+         ++mismatchCount;
+      }
+   }
+
+   if (mismatchCount > kMaxPrint) {
+      ADD_FAILURE() << "Further mismatches suppressed (total mismatches: " << mismatchCount << ")";
+   }
 
    // Checking output size
    EXPECT_EQ(output.size(), sizeof(Linear_16_ExpectedOutput::all_ones) / sizeof(float));
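For reference, the numeric baseline the test compares against is a per-component finite difference over the input. A generic central-difference sketch of that idea follows; the helper name, `f`, and step size are illustrative, not the test's actual `Linear_16_wrapper_num_diff`:

```cpp
#include <cstdio>
#include <functional>
#include <vector>

// Estimate d f / d x[i] by central differences around the given input.
float numeric_diff(const std::function<float(const std::vector<float> &)> &f,
                   std::vector<float> x, std::size_t i, float h = 1e-3f)
{
   x[i] += h;
   const float up = f(x);
   x[i] -= 2 * h;
   const float down = f(x);
   return (up - down) / (2 * h);
}

int main()
{
   auto f = [](const std::vector<float> &v) { return v[0] * v[0] + 3 * v[1]; };
   std::printf("%g %g\n",
               numeric_diff(f, {2.0f, 5.0f}, 0),  // ~ d/dx0 = 2*x0 = 4
               numeric_diff(f, {2.0f, 5.0f}, 1)); // ~ d/dx1 = 3
   return 0;
}
```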
