Skip to content

Commit 06b3f12

Browse files
nindanaoto and claude committed
Fix BK bus desync: atomic TREADY in AXISBRLater
Root cause of real-FPGA hang (hw_emu unaffected):

AXISBRLater feeds bootstrapping key (BK) data to ExternalProductMiddle via 4 AXI4-Stream register slices per polynomial half. Previously each slice received TREADY := trgswinready(k) independently, while trgswinvalid(k) = Cat(tvalidvec).andR (requires ALL 4 valid). When one HBM bus momentarily stalled (TVALID=0), the other 3 slices saw TREADY=1 and consumed their data, but TRGSWBatchMemory saw wen=0 and discarded it. This silently corrupted the BK accumulation, causing MULandACC to miss outflag events, finreg to stall short of its target, and BlindRotate to hang in FINWAIT forever.

Fix: require all 4 buses to be simultaneously valid before any slice advances (TREADY := trgswinready(k) && allValid). This is correct because HomGate dispatches all 4 buses in lock-step; the depth-8 slices absorb the small SLR-crossing skew.

Why hw_emu was unaffected: simulated HBM delivers all buses in lock-step every cycle, so the AND of the four TVALIDs was always 1 and the bug never fired.

Also includes diagnostic infrastructure added during investigation:
- BRBack_top.v: axis18 debug stream packs four 8-bit fields, each being bits [16:9] of a 17-bit saturating counter (feedback ch0/ch1, BK ch0/ch4 beat counts), for live hang diagnosis
- HomGate_top.v + HomGate_control_s_axi.v: 4th debug register at AXI-lite 0x20c captures the BRBack axis18 debug word
- xrt_regread.cpp: decode and print the new brback_dbg register with expected reference values in the 5-second polling output
- build_brback_xo.tcl: copy RTL sources into ip_repo/src to work around ipx::package_project file-copy reliability issue

Result: gate 0 = 1.32ms, gate 1 = 1.25ms, PASS on Alveo U280 @ 300MHz
Timing: WNS = 0.016ns, all constraints met, zero violations.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent bbb7381 commit 06b3f12

6 files changed

Lines changed: 67 additions & 10 deletions

File tree

chisel/HomGate/src/main/scala/BlindRotate.scala

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,14 +369,22 @@ class AXISBRLater(implicit val conf:Config) extends Module{
369369
for(k <- 0 until conf.k+1){
370370
val tvalidvec = Wire(Vec(conf.bknumbus/2,Bool()))
371371
val tdatavec = Wire(Vec(conf.bknumbus/2,UInt(conf.buswidth.W)))
372-
for(i <- 0 until conf.bknumbus/2){
372+
val allValid = Cat(tvalidvec).andR
373+
val slices = for(i <- 0 until conf.bknumbus/2) yield {
373374
val slice = Module(new AXI4StreamRegisterSlice(conf.buswidth,conf.axi4snumslice))
374375
slice.io.subordinate <> io.axi4bkin(k*conf.bknumbus/2+i)
375-
slice.io.manager.TREADY := extpmiddle.io.trgswinready(k)
376376
tvalidvec(i) := slice.io.manager.TVALID
377377
tdatavec(i) := slice.io.manager.TDATA
378+
slice
379+
}
380+
// Slices must advance atomically: only when ALL 4 buses are simultaneously valid
381+
// AND TRGSWBatchMemory is ready to accept. Without this, a momentary stall on any
382+
// one HBM bus causes the other 3 slices to consume their data without writing it to
383+
// TRGSWBatchMemory (silently discarding those BK beats → corrupted accumulation).
384+
for(i <- 0 until conf.bknumbus/2){
385+
slices(i).io.manager.TREADY := extpmiddle.io.trgswinready(k) && allValid
378386
}
379-
extpmiddle.io.trgswinvalid(k) := Cat(tvalidvec).andR
387+
extpmiddle.io.trgswinvalid(k) := allValid
380388
extpmiddle.io.trgswin(k) := Cat(tdatavec.reverse)
381389
}
382390

vitis/rtl/BRBack_top.v

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -338,9 +338,35 @@ module BRBack #(
338338

339339
assign axis16_tkeep = {4{1'b1}};
340340

341-
// axis18: tied off (unused)
342-
assign axis18_tvalid = 1'b0;
343-
assign axis18_tdata = 32'b0;
341+
// axis18: debug counters packed into 32 bits (→ HomGate axis10 → bkin0_beats register)
342+
// Each byte = bits[16:9] of a 17-bit beat counter (512 beats/unit):
343+
// [31:24] = axis04_tvalid >>9 (feedback ch0, max 81408→0x9F; 3 short→0x9E)
344+
// [23:16] = axis05_tvalid >>9 (feedback ch1, same expected value)
345+
// [15: 8] = axis06_tvalid >>9 (BK ch0 from HomGate, max 122112→0xEE; stalled→less)
346+
// [ 7: 0] = axis10_tvalid >>9 (BK ch4 from HomGate, same expected value as ch0)
347+
reg [16:0] dbg_fb0_cnt = 17'b0;
348+
reg [16:0] dbg_fb1_cnt = 17'b0;
349+
reg [16:0] dbg_bk0_cnt = 17'b0;
350+
reg [16:0] dbg_bk4_cnt = 17'b0;
351+
always @(posedge ap_clk) begin
352+
if (areset) begin
353+
dbg_fb0_cnt <= 17'b0;
354+
dbg_fb1_cnt <= 17'b0;
355+
dbg_bk0_cnt <= 17'b0;
356+
dbg_bk4_cnt <= 17'b0;
357+
end else begin
358+
if (axis04_tvalid && dbg_fb0_cnt != 17'h1FFFF)
359+
dbg_fb0_cnt <= dbg_fb0_cnt + 1;
360+
if (axis05_tvalid && dbg_fb1_cnt != 17'h1FFFF)
361+
dbg_fb1_cnt <= dbg_fb1_cnt + 1;
362+
if (axis06_tvalid && dbg_bk0_cnt != 17'h1FFFF)
363+
dbg_bk0_cnt <= dbg_bk0_cnt + 1;
364+
if (axis10_tvalid && dbg_bk4_cnt != 17'h1FFFF)
365+
dbg_bk4_cnt <= dbg_bk4_cnt + 1;
366+
end
367+
end
368+
assign axis18_tvalid = 1'b1;
369+
assign axis18_tdata = {dbg_fb0_cnt[16:9], dbg_fb1_cnt[16:9], dbg_bk0_cnt[16:9], dbg_bk4_cnt[16:9]};
344370
assign axis18_tkeep = {4{1'b1}};
345371
assign axis18_tlast = 1'b0;
346372

vitis/rtl/HomGate_control_s_axi.v

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ module HomGate_control_s_axi #(
5858
// Debug read-only registers
5959
input wire [31:0] dbg_reg0,
6060
input wire [31:0] dbg_reg1,
61-
input wire [31:0] dbg_reg2
61+
input wire [31:0] dbg_reg2,
62+
input wire [31:0] dbg_reg3
6263
);
6364

6465
//------------------------Address Info-------------------
@@ -148,6 +149,7 @@ module HomGate_control_s_axi #(
148149
ADDR_DBG_REG0 = 12'h200, // AXISIKS output beat count
149150
ADDR_DBG_REG1 = 12'h204, // axis01 input beat count (SEI→HomGate)
150151
ADDR_DBG_REG2 = 12'h208, // misc status bits
152+
ADDR_DBG_REG3 = 12'h20c, // BRBack debug word: four packed 8-bit counter fields (fb0, fb1, bk0, bk4)
151153
WRIDLE = 2'd0,
152154
WRDATA = 2'd1,
153155
WRRESP = 2'd2,
@@ -363,6 +365,7 @@ module HomGate_control_s_axi #(
363365
ADDR_DBG_REG0: rdata <= dbg_reg0;
364366
ADDR_DBG_REG1: rdata <= dbg_reg1;
365367
ADDR_DBG_REG2: rdata <= dbg_reg2;
368+
ADDR_DBG_REG3: rdata <= dbg_reg3;
366369
default: rdata <= 32'b0;
367370
endcase
368371
end

vitis/rtl/HomGate_top.v

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,8 @@ inst_control_s_axi (
676676
.axi20_ptr0 ( axi20_ptr0 ),
677677
.dbg_reg0 ( dbg_iksout_cnt ),
678678
.dbg_reg1 ( dbg_axis01_cnt ),
679-
.dbg_reg2 ( dbg_misc )
679+
.dbg_reg2 ( dbg_misc ),
680+
.dbg_reg3 ( dbg_bkin0_cnt )
680681
);
681682

682683
///////////////////////////////////////////////////////////////////////////////
@@ -728,30 +729,36 @@ wire axi4sout_TLAST;
728729
wire dm00_s2mm_err;
729730
wire dm00_s2mm_sts_tvalid;
730731

731-
// Debug beat counters (read via AXI-lite at 0x200/0x204/0x208)
732+
// Debug beat counters (read via AXI-lite at 0x200/0x204/0x208/0x20c)
732733
reg [31:0] dbg_iksout_cnt = 32'b0; // AXISIKS output beats accepted
733734
reg [31:0] dbg_axis01_cnt = 32'b0; // axis01 beats entering HomGate
734735
reg [31:0] dbg_misc = 32'b0; // latched/live status bits
736+
reg [31:0] dbg_bkin0_cnt = 32'b0; // packed debug word from BRBack (via axis18→axis10): four 8-bit counter fields
735737

736738
always @(posedge ap_clk) begin
737739
if (areset) begin
738740
dbg_iksout_cnt <= 32'b0;
739741
dbg_axis01_cnt <= 32'b0;
740742
dbg_misc <= 32'b0;
743+
dbg_bkin0_cnt <= 32'b0;
741744
end else begin
742745
// Count AXISIKS output accepted beats (expect 40 total for 2 batches)
743746
if (axi4siksout_TVALID && axi4siksout_TREADY && dbg_iksout_cnt != 32'hFFFFFFFF)
744747
dbg_iksout_cnt <= dbg_iksout_cnt + 1;
745748
// Count axis01 input beats from BRBack (expect 2050 per gate)
746749
if (axis01_tvalid && axis01_tready && dbg_axis01_cnt != 32'hFFFFFFFF)
747750
dbg_axis01_cnt <= dbg_axis01_cnt + 1;
751+
// Capture the packed BRBack debug word (four 8-bit counter fields) whenever axis10 presents valid data
752+
if (axis10_tvalid)
753+
dbg_bkin0_cnt <= axis10_tdata;
748754
// Latch misc status bits (sticky once set)
749755
if (axi4sout_TLAST) dbg_misc[0] <= 1'b1; // axi4sout TLAST seen
750756
if (dm00_s2mm_err) dbg_misc[1] <= 1'b1; // S2MM error
751757
if (axi4outcmd_TVALID) dbg_misc[2] <= 1'b1; // S2MM command issued
752758
if (dm00_s2mm_sts_tvalid) dbg_misc[3] <= 1'b1; // S2MM completed
753759
if (axi4siksout_TVALID) dbg_misc[4] <= 1'b1; // IKS ever had output valid
754760
if (axis01_tvalid) dbg_misc[5] <= 1'b1; // axis01 ever had data
761+
if (axi4bkin_0_TVALID) dbg_misc[6] <= 1'b1; // BK bus0 ever had data
755762
// Live (instantaneous) signals in bits 15:8
756763
dbg_misc[8] <= axi4siksout_TVALID;
757764
dbg_misc[9] <= axi4siksout_TREADY;

vitis/scripts/build_brback_xo.tcl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ for {set i 0} {$i <= 18} {incr i} {
4040
ipx::associate_bus_interfaces -busif [format "axis%02d" $i] -clock ap_clk [ipx::current_core]
4141
}
4242

43+
# Workaround: ipx::package_project -import_files sometimes fails to copy RTL files.
44+
# Manually copy sources to ip_repo/src so package_xo includes them.
45+
file mkdir ${ip_repo_dir}/src
46+
file copy -force ${rtl_dir}/BRBack_top.v ${ip_repo_dir}/src/BRBack_top.v
47+
file copy -force ${rtl_dir}/HomGateWrap.v ${ip_repo_dir}/src/HomGateWrap.v
48+
4349
# Save IP
4450
ipx::create_xgui_files [ipx::current_core]
4551
ipx::update_checksums [ipx::current_core]

xcltest/HomGate/xrt_regread.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,12 +154,13 @@ int fpga_wait_kernel(int run_idx) {
154154
break;
155155
}
156156
// Print AP control register and debug register diagnostics
157-
uint32_t ap_ctrl = 0xDEAD, dbg0 = 0xDEAD, dbg1 = 0xDEAD, dbg2 = 0xDEAD;
157+
uint32_t ap_ctrl = 0xDEAD, dbg0 = 0xDEAD, dbg1 = 0xDEAD, dbg2 = 0xDEAD, dbg3 = 0xDEAD;
158158
if (s_kern) {
159159
try { ap_ctrl = s_kern->read_register(0x000); } catch (...) {}
160160
try { dbg0 = s_kern->read_register(0x200); } catch (...) {}
161161
try { dbg1 = s_kern->read_register(0x204); } catch (...) {}
162162
try { dbg2 = s_kern->read_register(0x208); } catch (...) {}
163+
try { dbg3 = s_kern->read_register(0x20c); } catch (...) {}
163164
}
164165
std::cout << "fpga_wait_kernel: " << elapsed << "s elapsed, ap_ctrl=0x"
165166
<< std::hex << ap_ctrl << std::dec
@@ -170,6 +171,12 @@ int fpga_wait_kernel(int run_idx) {
170171
<< " iksout_beats=" << dbg0
171172
<< " axis01_beats=" << dbg1
172173
<< " misc=0x" << std::hex << dbg2 << std::dec
174+
<< " brback_dbg=0x" << std::hex << dbg3
175+
<< "(fb0=" << ((dbg3>>24)&0xFF) << "/0x9F"
176+
<< " fb1=" << ((dbg3>>16)&0xFF) << "/0x9F"
177+
<< " bk0=" << ((dbg3>>8)&0xFF) << "/0xEE"
178+
<< " bk4=" << ((dbg3>>0)&0xFF) << "/0xEE)"
179+
<< std::dec
173180
<< std::endl;
174181
if (elapsed >= timeout_s) {
175182
std::cerr << "fpga_wait_kernel: TIMEOUT after " << elapsed << "s" << std::endl;

0 commit comments

Comments
 (0)