Commit 3405621

Seema Mirchandaney committed
Add support for replicate op in distributed training
- Add perform_pass_expansion_for_replicate for fwd/bwd pass expansion
- Add perform_shard_expansion_for_replicate and _bwd for shard expansion
- Add build_replicate_invocation in make_dynamic_open_dataflow_graph
- Add is_replicate_attrs helper and guard replicate in copy_insertion
- Add ReplicateAttrs to TrainingOperationAttrs
- Add SumReductionFloat/Double for backward replicate reduce operation
- Add issue_replicate_bwd in spawn_dynamic_node_invocation
- Fix per_device_op_state init race condition with direct write
- Fix .value() calls on optional per_device_op_state across op impls
- Update issue_copy to support optional reduction op
- Add testcase for replicate op
1 parent b6d00f8 commit 3405621

18 files changed, 713 additions & 364 deletions


lib/op-attrs/src/op-attrs/ops/element_unary.cc

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ ParallelTensorDimDegrees get_output_parallel_dim_degrees(
     ElementUnaryAttrs const &attrs,
     ParallelTensorDimDegrees const &input_degrees) {
   ASSERT(input_degrees.sum_degree.value == 1);
-  ASSERT(input_degrees.discard_copy_degree.value == 1);
 
   return input_degrees;
 }

lib/op-attrs/test/src/op-attrs/ops/element_unary.cc

Lines changed: 0 additions & 8 deletions
@@ -62,13 +62,5 @@ TEST_SUITE(FF_TEST_SUITE) {
              SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)));
     }
 
-    SUBCASE("discard copy degree > 1") {
-      positive_int degree = 2_p;
-
-      CHECK_THROWS(get_output_shape(
-          attrs,
-          make_input(
-              SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)));
-    }
   }
 }
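Together with the header change above, this removes the requirement that element-unary inputs have discard-copy degree 1, which replicated inputs violate. Since get_output_parallel_dim_degrees now passes input_degrees through untouched, a positive test could presumably take the deleted SUBCASE's place. The sketch below assumes get_output_shape likewise returns the input shape unchanged, which this diff does not show:

SUBCASE("discard copy degree > 1") {
  // hypothetical replacement test: the shape should now pass through
  positive_int degree = 2_p;

  ParallelTensorShape input = make_input(
      SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p);

  CHECK(get_output_shape(attrs, input) == input);
}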

lib/realm-execution/include/realm-execution/realm_context.h

Lines changed: 11 additions & 8 deletions
@@ -63,15 +63,18 @@ struct RealmContext {
                     int priority = 0);
   ///\}
 
-  /** \name Data movement */
+  /** \name Data movement and reduction */
   ///\{
-  Realm::Event issue_copy(ParallelTensorShape const &src_shape,
-                          Realm::RegionInstance src_inst,
-                          ParallelTensorShape const &dst_shape,
-                          Realm::RegionInstance dst_inst,
-                          Realm::ProfilingRequestSet const &requests,
-                          Realm::Event wait_on = Realm::Event::NO_EVENT,
-                          int priority = 0);
+  Realm::Event
+      issue_copy(ParallelTensorShape const &src_shape,
+                 Realm::RegionInstance src_inst,
+                 ParallelTensorShape const &dst_shape,
+                 Realm::RegionInstance dst_inst,
+                 Realm::ProfilingRequestSet const &requests,
+                 Realm::Event wait_on = Realm::Event::NO_EVENT,
+                 int priority = 0,
+                 std::optional<Realm::ReductionOpID> redop_id = std::nullopt,
+                 bool exclusive = false);
   ///\}
 
   /** \name Instance management */
lib/realm-execution/include/realm-execution/sum_reduction.h

Lines changed: 0 additions & 99 deletions
This file was deleted.

lib/realm-execution/include/realm-execution/tasks/realm_reduction.h

Lines changed: 31 additions & 18 deletions
@@ -1,29 +1,32 @@
 #pragma once
-#include <realm.h>
 #include "op-attrs/datatype.dtg.h"
+#include <realm.h>
 
 namespace FlexFlow {
 
 // Sum reduction for float
 struct SumReductionFloat {
   using LHS = float;
   using RHS = float;
-  static constexpr RHS identity = 0.0f;   // ← inside struct, constexpr
+  static constexpr RHS identity = 0.0f; // ← inside struct, constexpr
 
   template <bool EXCLUSIVE>
   static void apply(LHS &lhs, RHS rhs) {
     if (EXCLUSIVE) {
       lhs += rhs;
     } else {
       // atomic add for non-exclusive
-      __sync_fetch_and_add((int*)&lhs, *(int*)&rhs);
+      __sync_fetch_and_add((int *)&lhs, *(int *)&rhs);
       // proper float atomic add — use union trick
-      union { float f; int i; } old_val, new_val;
+      union {
+        float f;
+        int i;
+      } old_val, new_val;
       do {
         old_val.f = lhs;
         new_val.f = old_val.f + rhs;
-      } while (!__sync_bool_compare_and_swap(
-          (int*)&lhs, old_val.i, new_val.i));
+      } while (
+          !__sync_bool_compare_and_swap((int *)&lhs, old_val.i, new_val.i));
     }
   }
 
@@ -32,34 +35,39 @@ struct SumReductionFloat {
     if (EXCLUSIVE) {
       rhs1 += rhs2;
     } else {
-      union { float f; int i; } old_val, new_val;
+      union {
+        float f;
+        int i;
+      } old_val, new_val;
       do {
         old_val.f = rhs1;
         new_val.f = old_val.f + rhs2;
-      } while (!__sync_bool_compare_and_swap(
-          (int*)&rhs1, old_val.i, new_val.i));
+      } while (
+          !__sync_bool_compare_and_swap((int *)&rhs1, old_val.i, new_val.i));
     }
   }
 };
 
-
 // Sum reduction for double
 struct SumReductionDouble {
   using LHS = double;
   using RHS = double;
-  static constexpr RHS identity = 0.0;   // ← inside struct, constexpr
+  static constexpr RHS identity = 0.0; // ← inside struct, constexpr
 
   template <bool EXCLUSIVE>
   static void apply(LHS &lhs, RHS rhs) {
     if (EXCLUSIVE) {
       lhs += rhs;
     } else {
-      union { double d; long long i; } old_val, new_val;
+      union {
+        double d;
+        long long i;
+      } old_val, new_val;
       do {
         old_val.d = lhs;
         new_val.d = old_val.d + rhs;
       } while (!__sync_bool_compare_and_swap(
-          (long long*)&lhs, old_val.i, new_val.i));
+          (long long *)&lhs, old_val.i, new_val.i));
     }
   }
 
@@ -68,26 +76,31 @@ struct SumReductionDouble {
     if (EXCLUSIVE) {
       rhs1 += rhs2;
     } else {
-      union { double d; long long i; } old_val, new_val;
+      union {
+        double d;
+        long long i;
+      } old_val, new_val;
       do {
         old_val.d = rhs1;
         new_val.d = old_val.d + rhs2;
       } while (!__sync_bool_compare_and_swap(
-          (long long*)&rhs1, old_val.i, new_val.i));
+          (long long *)&rhs1, old_val.i, new_val.i));
     }
   }
 };
 
 // Reduction op IDs — must not conflict with other registered redops
 enum SumReductionOpIDs {
-  REDOP_SUM_FLOAT  = 1,
+  REDOP_SUM_FLOAT = 1,
   REDOP_SUM_DOUBLE = 2,
 };
 
 inline Realm::ReductionOpID get_sum_reduction_op_id(DataType dtype) {
   switch (dtype) {
-    case DataType::FLOAT: return REDOP_SUM_FLOAT;
-    case DataType::DOUBLE: return REDOP_SUM_DOUBLE;
+  case DataType::FLOAT:
+    return REDOP_SUM_FLOAT;
+  case DataType::DOUBLE:
+    return REDOP_SUM_DOUBLE;
   default:
     PANIC("no sum reduction registered for datatype {}", dtype);
   }
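These structs follow the shape Realm expects of a reduction operator (LHS/RHS typedefs, an identity, and EXCLUSIVE-templated apply/fold), so the runtime can presumably wrap them via ReductionOpUntyped. The registration call site is not part of this diff; the sketch below is an assumption about how the IDs get bound:

// Hypothetical registration sketch: the IDs handed to issue_copy must be
// registered with Realm before any reducing copy is issued.
#include "realm-execution/tasks/realm_reduction.h"
#include <realm.h>

void register_sum_redops(Realm::Runtime runtime) {
  using namespace FlexFlow;
  runtime.register_reduction(
      REDOP_SUM_FLOAT,
      Realm::ReductionOpUntyped::create_reduction_op<SumReductionFloat>());
  runtime.register_reduction(
      REDOP_SUM_DOUBLE,
      Realm::ReductionOpUntyped::create_reduction_op<SumReductionDouble>());
}

Note that the non-exclusive float apply above performs a raw __sync_fetch_and_add on the reinterpreted bits and then runs the compare-and-swap loop; only the CAS loop (the "union trick" named in the comment) computes a correct float sum.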

lib/realm-execution/src/realm-execution/distributed_per_device_op_state_initialization.cc

Lines changed: 5 additions & 1 deletion
@@ -31,6 +31,7 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
   std::unordered_map<DynamicNodeInvocation,
                      DeviceSpecificPtr<PerDeviceOpState> *>
       device_state_map;
+  std::vector<Realm::Event> completion_events;
   for (DynamicNodeInvocation const &invocation : dg.invocations) {
     Realm::Processor target_proc = ctx.map_device_coord_to_processor(
         assert_unwrap(invocation.node_attrs.device_coord));
@@ -56,14 +57,17 @@ PerDeviceOpStateBacking perform_distributed_per_device_op_state_initialization(
         precondition);
 
     if (completion_event.has_value()) {
+      completion_events.push_back(completion_event.value());
       device_state_map.insert(std::pair{invocation, device_state_ptr});
     } else {
       // Task doesn't require initialization, clean up and don't store result
       delete device_state_ptr;
     }
   }
 
-  ctx.get_outstanding_events().wait();
+  // wait for all init tasks — direct write to *result_ptr happens
+  // before each init task event fires so result is ready after this
+  Realm::Event::merge_events(completion_events).wait();
 
   auto deref = [](DeviceSpecificPtr<PerDeviceOpState> *const &p) { return *p; };
   std::unordered_map<DynamicNodeInvocation, DeviceSpecificPtr<PerDeviceOpState>>
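The race fix narrows the barrier: instead of waiting on every event the context has outstanding, the function now collects the completion event of each initialization task it actually spawned and waits on their merge. The pattern in isolation (the task ID, argument struct, and helper below are hypothetical placeholders, not names from this commit):

// Collect one completion event per spawned task, then block on the merge.
std::vector<Realm::Event> events;
for (Realm::Processor proc : target_procs) {
  InitArgs args = make_init_args(proc); // hypothetical helper
  events.push_back(proc.spawn(INIT_TASK_ID, &args, sizeof(args)));
}
// merge_events yields an event that triggers once all inputs have
// triggered; waiting on it is strictly narrower than a global wait.
Realm::Event::merge_events(events).wait();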

lib/realm-execution/src/realm-execution/pcg_instance.cc

Lines changed: 54 additions & 0 deletions
@@ -6,6 +6,7 @@
 #include "realm-execution/instance_allocation.h"
 #include "realm-execution/realm_context.h"
 #include "realm-execution/tasks/impl/op_task.h"
+#include "realm-execution/tasks/realm_reduction.h"
 #include "realm-execution/tensor_instance_backing.h"
 #include "task-spec/dynamic_graph/copy_insertion.h"
 #include "task-spec/dynamic_graph/dynamic_node_invocation.dtg.h"
@@ -215,18 +216,71 @@ static Realm::Event spawn_dynamic_node_invocation(
         precondition);
   };
 
+  // issue_replicate_bwd lambda
+  auto issue_replicate_bwd = [&]() {
+    std::optional<DynamicValueAttrs> output_grad_opt;
+    for (auto const &[slot, value] : invocation.inputs) {
+      if (slot.slot_tensor_role == DynamicTensorRole{FwbTensorType::GRADIENT}) {
+        output_grad_opt = value;
+      }
+    }
+    DynamicValueAttrs output_grad = assert_unwrap(output_grad_opt);
+    DynamicValueAttrs input_grad = get_only(invocation.outputs).second;
+    Realm::RegionInstance dst_inst =
+        tensor_instance_backing.backing.at(input_grad).first;
+
+    Realm::ReductionOpID redop_id = get_sum_reduction_op_id(
+        assert_unwrap(output_grad.parallel_tensor_shape).data_type);
+
+    // chain reductions sequentially to avoid write races on dst
+    Realm::Event e = precondition;
+    for (auto const &[p, m] : assert_unwrap(output_grad.mapping)) {
+      DynamicValueAttrs replica_key = output_grad;
+      replica_key.mapping =
+          bidict<ParallelTensorSpaceCoordinate, MachineSpaceCoordinate>{{p, m}};
+      replica_key.shard_coord = p;
+
+      Realm::RegionInstance src_inst =
+          tensor_instance_backing.backing.at(replica_key).first;
+
+      e = ctx.issue_copy(assert_unwrap(output_grad.parallel_tensor_shape),
+                         src_inst,
+                         assert_unwrap(input_grad.parallel_tensor_shape),
+                         dst_inst,
+                         Realm::ProfilingRequestSet{},
+                         e,
+                         0,
+                         redop_id,
+                         false);
+    }
+    return e;
+  };
+
   TrainingOperationAttrs op_attrs =
       assert_unwrap(invocation.node_attrs.op_attrs);
   return op_attrs.visit<Realm::Event>(overload{
       [&](PCGOperatorAttrs const &pcg_op_attrs) {
         return pcg_op_attrs.visit<Realm::Event>(overload{
             [&](InputAttrs const &) { return Realm::Event::NO_EVENT; },
             [&](WeightAttrs const &) { return Realm::Event::NO_EVENT; },
+            [&](ReplicateAttrs const &) {
+              // this should never be reached since replicate
+              // goes through TrainingOperationAttrs::ReplicateAttrs
+              PANIC("unexpected replicate in PCGOperatorAttrs path");
+              return Realm::Event::NO_EVENT;
+            },
             [&](auto const &) { return spawn_task(); },
         });
       },
      [&](LossAttrs const &) { return spawn_task(); },
      [&](CopyAttrs const &) { return issue_copy(); },
+      [&](ReplicateAttrs const &) {
+        if (invocation.node_attrs.task_type.has_value() &&
+            invocation.node_attrs.task_type.value() == DynamicTaskType::BWD) {
+          return issue_replicate_bwd();
+        }
+        return issue_copy();
+      },
   });
 }
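Semantically, the backward pass of replicate sums the gradient from every replica into the one input gradient, which is why each hop in issue_replicate_bwd is a reducing issue_copy rather than a plain copy. A scalar model of what the chained loop computes:

// Scalar model of issue_replicate_bwd: the input gradient accumulates the
// output gradient of each replica, one reduction per mapping entry.
// (Illustrative only; the real code reduces Realm region instances.)
#include <numeric>
#include <vector>

float replicate_bwd_model(std::vector<float> const &replica_grads) {
  return std::accumulate(replica_grads.begin(), replica_grads.end(), 0.0f);
}

Because each issue_copy waits on the event of the previous one, the reductions never overlap in time; exclusive = false is therefore the conservative choice, and the exclusive apply path would also be safe under this chaining.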
