Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions include/PTO/Transforms/InsertSync/SyncCodegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,19 @@ class SyncCodegen {

// Insert the compiler tail-clean barrier right before function return.
void AppendAutoSyncTailBarrierIfNeeded(IRRewriter &rewriter);

// [Fix #428] Collect back-edge waits at loop heads that need pre-loop
// set_flag initialization to avoid hangs on iteration 0.
void CollectBackEdgeLoopHeadWaits();

// [Fix #428] Emit set_flag ops before a for-loop to initialize event
// registers for back-edge wait_flag ops at the loop head.
void EmitPreLoopEventInit(IRRewriter &rewriter, Operation *op);

// [Fix #428] Emit explicit wait_flag ops before function return to drain
// all pending back-edge event dependencies that pipe_barrier(PIPE_ALL)
// alone does not cover.
void EmitTailEventDrain(IRRewriter &rewriter, func::ReturnOp ret);

void CreateSetWaitOpForSingleBuffer(IRRewriter &rewriter, Operation *op,
SyncOperation *sync, bool beforeInsert);
Expand Down Expand Up @@ -92,6 +105,10 @@ class SyncCodegen {

// Deferred tail-clean barrier requested by sync analysis.
bool pendingAutoSyncTailBarrier_ = false;

// [Fix #428] Map from scf.for Operation* to the back-edge wait ops at
// its loop head that need pre-loop set_flag initialization.
DenseMap<const Operation *, SmallVector<SyncOperation *, 4>> preLoopInitWaits_;
};

} // namespace pto
Expand Down
169 changes: 164 additions & 5 deletions lib/PTO/Transforms/InsertSync/SyncCodegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,25 @@ static void MergeSyncList(SyncOps &dstList, const SyncOps &srcList) {
void SyncCodegen::Run() {
MLIRContext *ctx = func_->getContext();
IRRewriter rewriter(ctx);

UpdateOpInsertSync(rewriter);

// [Optional Debug] 这里的 Debug 打印可以保留或注释掉
// ...


// [Fix #428] Collect back-edge wait ops at loop heads that need pre-loop
// set_flag initialization. A back-edge wait_flag at LOOP_BEGIN fires on
// iteration 0, but the matching set_flag only fires at the end of iteration
// 0. Without a pre-loop set_flag, the event register is uninitialized and
// the hardware hangs.
CollectBackEdgeLoopHeadWaits();

func_->walk<WalkOrder::PreOrder>([&](Operation *op) {
if (op2InsertSync.count(op)) {
// [Fix #428] Before emitting the normal pipeBefore waits for a loop op,
// emit pre-loop set_flag initialization for any back-edge wait_flag that
// would otherwise fire against an uninitialized event register.
if (isa<scf::ForOp>(op)) {
EmitPreLoopEventInit(rewriter, op);
}

// 处理 PRE Sync
for (auto &syncBefore : op2InsertSync[op].pipeBefore) {
SyncInsert(rewriter, op, syncBefore, true);
Expand Down Expand Up @@ -299,6 +310,15 @@ void SyncCodegen::AppendAutoSyncTailBarrierIfNeeded(IRRewriter &rewriter) {

auto pipeAllAttr = getPipeAttr(rewriter, PipelineType::PIPE_ALL);
for (auto ret : returns) {
// [Fix #428] Before the tail barrier, emit explicit wait_flag ops to
// drain all pending back-edge event dependencies. pipe_barrier(PIPE_ALL)
// waits for in-flight pipe operations but does NOT drain event flag
// registers. Without explicit wait_flag calls, stale event state can
// leak to the next kernel invocation.
rewriter.setInsertionPoint(ret);
EmitTailEventDrain(rewriter, ret);

// Re-set insertion point before ret (after any drain ops we just emitted)
rewriter.setInsertionPoint(ret);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This call to setInsertionPoint is redundant. The insertion point is already set to ret (before the return operation) at line 318. Creating operations via the rewriter in EmitTailEventDrain maintains the insertion point before ret, so there is no need to reset it here.

auto barrier = rewriter.create<pto::BarrierOp>(ret.getLoc(), pipeAllAttr);
barrier->setAttr("pto.auto_sync_tail_barrier", rewriter.getUnitAttr());
Expand Down Expand Up @@ -332,6 +352,84 @@ void SyncCodegen::CreateSetWaitOpForSingleBuffer(IRRewriter &rewriter,
rewriter.create<pto::SetFlagOp>(op->getLoc(), srcPipe, dstPipe, eventId);
}
}

// ==============================================================================
// [Fix #428] Tail event drain for back-edge sync events
// ==============================================================================

void SyncCodegen::EmitTailEventDrain(IRRewriter &rewriter,
func::ReturnOp ret) {
// Collect all unique (srcPipe, dstPipe, eventId) triples from back-edge
// syncs across all loops. These events may still be pending when the kernel
// reaches the return statement and must be explicitly drained.
//
// We use a set of tuples to deduplicate — the same event may appear in
// multiple loops or be shared via widen.
struct EventKey {
PipelineType src;
PipelineType dst;
int eventId;
bool operator<(const EventKey &o) const {
if (src != o.src) return static_cast<unsigned>(src) < static_cast<unsigned>(o.src);
if (dst != o.dst) return static_cast<unsigned>(dst) < static_cast<unsigned>(o.dst);
return eventId < o.eventId;
}
Comment on lines +372 to +376
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The comparison operator can be simplified using std::tie for better readability and maintainability. This avoids manual field-by-field comparison logic.

    bool operator<(const EventKey &o) const {
      return std::tie(src, dst, eventId) < std::tie(o.src, o.dst, o.eventId);
    }

bool operator==(const EventKey &o) const {
return src == o.src && dst == o.dst && eventId == o.eventId;
}
};

SmallVector<EventKey> drainEvents;
auto addUnique = [&](PipelineType src, PipelineType dst, int eid) {
EventKey key{src, dst, eid};
for (auto &existing : drainEvents) {
if (existing == key)
return;
}
drainEvents.push_back(key);
};
Comment on lines +383 to +390
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Instead of a manual linear search for uniqueness in a lambda, you can simply collect all events and then use std::unique after the sorting step at line 421. This is more idiomatic and efficient as the number of events grows.

  auto addUnique = [&](PipelineType src, PipelineType dst, int eid) {
    drainEvents.push_back({src, dst, eid});
  };


// Scan all LOOP_END elements for back-edge set/wait pairs with allocated
// event IDs.
for (auto &pair : preLoopInitWaits_) {
for (auto *waitSync : pair.second) {
if (waitSync->uselessSync || waitSync->eventIds.empty())
continue;
addUnique(waitSync->GetActualSrcPipe(), waitSync->GetActualDstPipe(),
waitSync->eventIds[0]);
Comment on lines +395 to +399
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Guard tail event drains by control-flow reachability

This unconditionally adds every collected loop-head event to the tail drain set, so AppendAutoSyncTailBarrierIfNeeded emits wait_flag for those events on all return paths. If a loop is conditionally skipped (e.g., loop inside an scf.if false branch), its pre-loop set_flag is never executed, but the function-tail drain wait still runs and can block on an uninitialized event, reintroducing hangs. The drain list needs to be filtered to events guaranteed to be initialized on the current return path (or emitted under matching control flow).

Useful? React with 👍 / 👎.

}
}

// Also scan the last element's pipeAfter for any set_flag ops that might
// leave events pending (these are the "syncEnd" phantom pairs from
// UpdateBackwardMatchSync that sink to the function tail).
if (!syncIR_.empty()) {
for (auto *sync : syncIR_.back()->pipeAfter) {
if (sync->uselessSync || sync->eventIds.empty())
continue;
if (sync->isSyncWaitType()) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

There appears to be a logic error here. To "drain" pending events at the function tail, the compiler should identify producer operations (set_flag) that have not been matched by a consumer (wait_flag). If a wait_flag is already present in pipeAfter, the event is already being drained. The comment on line 403 correctly states this should scan for set_flag ops, but the code checks for isSyncWaitType(). This should be changed to isSyncSetType() to correctly identify pending signals that require an explicit drain.

Suggested change
if (sync->isSyncWaitType()) {
if (sync->isSyncSetType()) {

addUnique(sync->GetActualSrcPipe(), sync->GetActualDstPipe(),
sync->eventIds[0]);
}
}
}

if (drainEvents.empty())
return;

// Sort for deterministic output.
llvm::sort(drainEvents);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

If the addUnique logic is simplified to just push_back, you should perform the deduplication here after sorting.

Suggested change
llvm::sort(drainEvents);
llvm::sort(drainEvents);
drainEvents.erase(std::unique(drainEvents.begin(), drainEvents.end()), drainEvents.end());


LLVM_DEBUG(llvm::dbgs() << "[Fix #428] Emitting " << drainEvents.size()
<< " tail event drain wait_flag ops\n");

for (auto &ev : drainEvents) {
auto srcPipe = getPipeAttr(rewriter, ev.src);
auto dstPipe = getPipeAttr(rewriter, ev.dst);
auto eventId = getEventAttr(rewriter, ev.eventId);
rewriter.create<pto::WaitFlagOp>(ret.getLoc(), srcPipe, dstPipe, eventId);
}
}

void SyncCodegen::CreateSetWaitOpForMultiBuffer(IRRewriter &rewriter,
Operation *op,
Expand Down Expand Up @@ -399,3 +497,64 @@ Value SyncCodegen::GetBufferSelected(IRRewriter &rewriter, Operation *op,
SyncIndex2SelectBuffer[sync->GetSyncIndex()] = selected;
return selected;
}

// ==============================================================================
// [Fix #428] Pre-loop event initialization for back-edge sync
// ==============================================================================

void SyncCodegen::CollectBackEdgeLoopHeadWaits() {
for (auto &nowElement : syncIR_) {
auto *loopElement = dyn_cast<LoopInstanceElement>(nowElement.get());
if (!loopElement || loopElement->getLoopKind() != KindOfLoop::LOOP_END)
continue;

// Look at the LOOP_BEGIN node's pipeBefore — these are waits that
// MoveSyncState hoisted to the loop head.
auto *loopBegin =
dyn_cast<LoopInstanceElement>(syncIR_[loopElement->beginId].get());
if (!loopBegin)
continue;

for (auto *sync : loopBegin->pipeBefore) {
if (sync->uselessSync)
continue;
if (!sync->isSyncWaitType())
continue;
if (sync->eventIds.empty())
continue;
Comment on lines +521 to +524
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restrict pre-loop init to backward-loop waits

This treats all waits hoisted to LOOP_BEGIN as back-edge waits and emits synthetic pre-loop set_flag for each, but MoveSyncState::PlanMoveOutWaitSync hoists any wait whose producer is outside the loop (not only loop-carried/back-edge cases). Initializing those ordinary waits here can satisfy them before the real producer runs and weaken required ordering. Filter candidates with an explicit back-edge marker (for example GetForEndIndex().has_value() / reallocated-loop-head-tail metadata) before adding them to preLoopInitWaits_.

Useful? React with 👍 / 👎.

// This is a wait at loop head with an allocated event ID.
// It needs a pre-loop set_flag to initialize the event register.
// Record {Operation* forOp -> SyncOperation* wait} for later emission.
if (loopElement->elementOp) {
preLoopInitWaits_[loopElement->elementOp].push_back(sync);
}
}
}
}

void SyncCodegen::EmitPreLoopEventInit(IRRewriter &rewriter, Operation *op) {
auto it = preLoopInitWaits_.find(op);
if (it == preLoopInitWaits_.end())
return;

// For each back-edge wait at the loop head, emit a set_flag before the
// for loop to initialize the event register. This ensures that on iteration
// 0, the wait_flag finds a valid (already-set) event instead of hanging.
rewriter.setInsertionPoint(op);
for (auto *waitSync : it->second) {
if (waitSync->uselessSync || waitSync->eventIds.empty())
continue;

auto srcPipe = getPipeAttr(rewriter, waitSync->GetActualSrcPipe());
auto dstPipe = getPipeAttr(rewriter, waitSync->GetActualDstPipe());
auto eventId = getEventAttr(rewriter, waitSync->eventIds[0]);

LLVM_DEBUG(llvm::dbgs()
<< "[Fix #428] Emitting pre-loop set_flag("
<< static_cast<unsigned>(waitSync->GetActualSrcPipe()) << ", "
<< static_cast<unsigned>(waitSync->GetActualDstPipe()) << ", "
<< waitSync->eventIds[0] << ") before loop\n");

rewriter.create<pto::SetFlagOp>(op->getLoc(), srcPipe, dstPipe, eventId);
}
}
Loading