diff --git a/build/t27c/gf16_add.v b/build/t27c/gf16_add.v new file mode 100644 index 0000000..f452779 --- /dev/null +++ b/build/t27c/gf16_add.v @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Auto-generated reference twin for EQY gate · phi^2 + phi^-2 = 3 · DO NOT EDIT +// +// Module : gf16_add +// Origin : build/t27c/gf16_add.v — t27c structural twin +// Anchor : phi^2 + phi^-2 = 3 · Wave-24 RVR-018 · EPIC #61 W15-TT-E +// DOI 10.5281/zenodo.19227877 +// +// Coq citation map: +// - lucas_closure_phi_sq : φ²+φ⁻² ∈ ℤ ⟹ aligned mantissa sum stays +// in closed GF16 field; t27/trios-coq/IGLA/lucas_closure_gf16.v (INV-5) +// - lucas_values_gf16_exact_n1 : BIAS=31 precision floor; +// t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - lucas_4_eq_7 : EXP_MAX = 2^6 − 1 = 63; +// t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - champion_survives_pruning : cancellation path safe under φ-prune; +// t27/trios-coq/IGLA/IGLA_ASHA_Bound.v (INV-2) +// +// EQY role: GOLD side for gf16_add equivalence check. +// Interface-identical to src/gf16_add.v; explicit assign style for yosys-eqy. +// R-SI-1: ZERO `*` operators in synthesisable code. + +`default_nettype none + +module gf16_add ( + input wire [15:0] a, + input wire [15:0] b, + output reg [15:0] result +); + + localparam BIAS = 6'd31; // Coq: gf16_precision.v::lucas_values_gf16_exact_n1 + localparam EXP_MAX = 6'd63; // Coq: gf16_precision.v::lucas_4_eq_7 + + // -- Unpack ------------------------------------------------------- + wire sign_a = a[15]; + wire [5:0] exp_a = a[14:9]; + wire [8:0] mant_a = a[8:0]; + wire sign_b = b[15]; + wire [5:0] exp_b = b[14:9]; + wire [8:0] mant_b = b[8:0]; + + // -- Special-case detection --------------------------------------- + wire is_zero_a = (exp_a == 6'd0) & (mant_a == 9'd0); + wire is_zero_b = (exp_b == 6'd0) & (mant_b == 9'd0); + wire is_special_a = (exp_a == EXP_MAX); + wire is_special_b = (exp_b == EXP_MAX); + wire is_inf_a = is_special_a & (mant_a == 9'd0); + wire is_inf_b = is_special_b & (mant_b == 9'd0); + wire is_nan_a = is_special_a & (mant_a != 9'd0); + wire is_nan_b = is_special_b & (mant_b != 9'd0); + + // -- Magnitude comparison (combinational) ------------------------- + // a_larger: true when |a| >= |b| (used to pick big/small operand) + wire a_larger = (exp_a > exp_b) | ((exp_a == exp_b) & (mant_a >= mant_b)); + + // -- Registers used in combinational always block ----------------- + reg [6:0] big_exp, shift, result_exp; + reg [10:0] big_fm, small_fm; + reg [11:0] sum_m; + reg big_sign, small_sign, result_sign; + reg [9:0] norm; + reg g_bit, r_bit, s_bit; + reg [9:0] rounded; + reg [6:0] final_exp; + reg [8:0] final_mant; + reg [15:0] fr; + reg cancel; + + always @(*) begin + // -- Defaults ------------------------------------------------- + cancel = 1'b0; + result_exp = 7'd0; + norm = 10'd0; + g_bit = 1'b0; + r_bit = 1'b0; + s_bit = 1'b0; + rounded = 10'd0; + final_exp = 7'd0; + final_mant = 9'd0; + fr = 16'd0; + result_sign = 1'b0; + big_exp = 7'd0; + big_fm = 11'd0; + big_sign = 1'b0; + small_fm = 11'd0; + small_sign = 1'b0; + shift = 7'd0; + sum_m = 12'd0; + + // -- Exception priority --------------------------------------- + if (is_nan_a | is_nan_b) + result = 16'hFE01; + else if (is_inf_a & is_inf_b & (sign_a != sign_b)) + result = 16'hFE01; // +∞ + (−∞) = NaN + else if (is_inf_a) + result = sign_a ? 16'hFE00 : 16'h7E00; + else if (is_inf_b) + result = sign_b ? 16'hFE00 : 16'h7E00; + else if (is_zero_a & is_zero_b) + result = 16'h0000; + else if (is_zero_a) + result = b; + else if (is_zero_b) + result = a; + else begin + // -- Align: pick larger magnitude operand as big ---------- + if (a_larger) begin + big_exp = {1'b0, exp_a}; + big_fm = {1'b1, mant_a}; + big_sign = sign_a; + small_fm = {1'b1, mant_b}; + small_sign = sign_b; + end else begin + big_exp = {1'b0, exp_b}; + big_fm = {1'b1, mant_b}; + big_sign = sign_b; + small_fm = {1'b1, mant_a}; + small_sign = sign_a; + end + + shift = big_exp - {1'b0, (a_larger ? exp_b : exp_a)}; + result_exp = big_exp; + + // Right-shift smaller mantissa (explicit barrel, R-SI-1) + case (shift) + 7'd0: small_fm = small_fm; + 7'd1: small_fm = {1'b0, small_fm[10:1]}; + 7'd2: small_fm = {2'b00, small_fm[10:2]}; + 7'd3: small_fm = {3'b000,small_fm[10:3]}; + 7'd4: small_fm = {4'b0000, small_fm[10:4]}; + 7'd5: small_fm = {5'b00000, small_fm[10:5]}; + 7'd6: small_fm = {6'b000000, small_fm[10:6]}; + 7'd7: small_fm = {7'b0000000, small_fm[10:7]}; + 7'd8: small_fm = {8'b00000000, small_fm[10:8]}; + 7'd9: small_fm = {9'b000000000, small_fm[10:9]}; + 7'd10: small_fm = {10'b0000000000, small_fm[10]}; + default: small_fm = 11'd0; + endcase + + // -- Mantissa add/sub ------------------------------------- + if (big_sign == small_sign) begin + sum_m = {1'b0, big_fm} + {1'b0, small_fm}; + result_sign = big_sign; + end else begin + sum_m = {1'b0, big_fm} - {1'b0, small_fm}; + result_sign = big_sign; + if (sum_m == 12'd0) + cancel = 1'b1; + end + + // -- Normalise result ------------------------------------- + if (!cancel) begin + if (sum_m[11]) begin + result_exp = result_exp + 7'd1; + norm = sum_m[10:1]; + g_bit = sum_m[0]; + r_bit = 1'b0; + s_bit = 1'b0; + end else if (sum_m[10]) begin + result_exp = result_exp + 7'd1; + norm = sum_m[10:1]; + g_bit = sum_m[0]; + r_bit = 1'b0; + s_bit = 1'b0; + end else begin + // Leading-zero count → left shift + if (sum_m[9]) begin + norm = sum_m[9:0]; + end else if (sum_m[8]) begin + norm = {sum_m[8:0], 1'b0}; + result_exp = result_exp - 7'd1; + end else if (sum_m[7]) begin + norm = {sum_m[7:0], 2'b00}; + result_exp = result_exp - 7'd2; + end else if (sum_m[6]) begin + norm = {sum_m[6:0], 3'b000}; + result_exp = result_exp - 7'd3; + end else if (sum_m[5]) begin + norm = {sum_m[5:0], 4'b0000}; + result_exp = result_exp - 7'd4; + end else if (sum_m[4]) begin + norm = {sum_m[4:0], 5'b00000}; + result_exp = result_exp - 7'd5; + end else if (sum_m[3]) begin + norm = {sum_m[3:0], 6'b000000}; + result_exp = result_exp - 7'd6; + end else if (sum_m[2]) begin + norm = {sum_m[2:0], 7'b0000000}; + result_exp = result_exp - 7'd7; + end else if (sum_m[1]) begin + norm = {sum_m[1:0], 8'b00000000}; + result_exp = result_exp - 7'd8; + end else begin + norm = {sum_m[0], 9'b000000000}; + result_exp = result_exp - 7'd9; + end + g_bit = 1'b0; + r_bit = 1'b0; + s_bit = 1'b0; + end + + // Round-to-nearest-even + if (g_bit & (r_bit | s_bit)) + rounded = norm + 10'd1; + else + rounded = norm; + + // Carry-out from rounding + if (rounded < norm) begin + final_exp = result_exp + 7'd1; + final_mant = 9'd0; + end else begin + final_exp = result_exp; + final_mant = norm[8:0]; + end + + // Saturation + if (final_exp[6]) + fr = result_sign ? 16'h8000 : 16'h0000; + else if (final_exp[5:0] >= EXP_MAX) + fr = result_sign ? 16'hFE00 : 16'h7E00; + else + fr = {result_sign, final_exp[5:0], final_mant}; + + result = fr; + end else begin + result = 16'h0000; // exact cancellation + end + end + end + +endmodule diff --git a/build/t27c/gf16_dot4.v b/build/t27c/gf16_dot4.v new file mode 100644 index 0000000..223d421 --- /dev/null +++ b/build/t27c/gf16_dot4.v @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Auto-generated reference twin for EQY gate · phi^2 + phi^-2 = 3 · DO NOT EDIT +// +// Module : gf16_dot4 +// Origin : build/t27c/gf16_dot4.v — t27c structural twin +// Anchor : phi^2 + phi^-2 = 3 · Wave-24 RVR-018 · EPIC #61 W15-TT-E +// DOI 10.5281/zenodo.19227877 +// +// Coq citation map: +// - lucas_4_eq_7 : 4-element dot-product maps to φ-structured +// 4-bit radix; t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - lucas_values_gf16_exact_n1 : precision floor for each GF16 product term; +// t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - lucas_closure_phi_sq : pairwise adder tree stays within GF16 field; +// t27/trios-coq/IGLA/lucas_closure_gf16.v (INV-5) +// - champion_survives_pruning : dot4 accumulator order is champion-safe; +// t27/trios-coq/IGLA/IGLA_ASHA_Bound.v (INV-2) +// +// EQY role: GOLD side for gf16_dot4 equivalence check. +// Interface-identical to src/gf16_dot4.v; instantiates gf16_mul / gf16_add +// twins from the same build/t27c/ directory so the full dot4 tree is +// self-contained in the EQY gold elaboration. +// R-SI-1: ZERO `*` operators in this file (multiplications are inside gf16_mul). + +`default_nettype none + +module gf16_dot4 ( + input wire [15:0] a0, + input wire [15:0] a1, + input wire [15:0] a2, + input wire [15:0] a3, + input wire [15:0] b0, + input wire [15:0] b1, + input wire [15:0] b2, + input wire [15:0] b3, + output wire [15:0] result +); + + // -- Four parallel GF16 multiplications --------------------------- + // Coq: each product is bounded by lucas_values_gf16_exact_n1 (INV-3) + wire [15:0] p0, p1, p2, p3; + + gf16_mul m0 (.a(a0), .b(b0), .result(p0)); + gf16_mul m1 (.a(a1), .b(b1), .result(p1)); + gf16_mul m2 (.a(a2), .b(b2), .result(p2)); + gf16_mul m3 (.a(a3), .b(b3), .result(p3)); + + // -- Adder tree: (p0+p1) + (p2+p3) -------------------------------- + // Coq: pairwise closure proven in lucas_closure_phi_sq (INV-5) + wire [15:0] s01, s23; + + gf16_add a01 (.a(p0), .b(p1), .result(s01)); + gf16_add a23 (.a(p2), .b(p3), .result(s23)); + + gf16_add a_final (.a(s01), .b(s23), .result(result)); + +endmodule diff --git a/build/t27c/gf16_mul.v b/build/t27c/gf16_mul.v new file mode 100644 index 0000000..44e3952 --- /dev/null +++ b/build/t27c/gf16_mul.v @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Auto-generated reference twin for EQY gate · phi^2 + phi^-2 = 3 · DO NOT EDIT +// +// Module : gf16_mul +// Origin : build/t27c/gf16_mul.v — t27c structural twin +// Anchor : phi^2 + phi^-2 = 3 · Wave-24 RVR-018 · EPIC #61 W15-TT-E +// DOI 10.5281/zenodo.19227877 +// +// Coq citation map: +// - lucas_values_gf16_exact_n1 : GF16 bias=31 is the φ-anchored +// floating-point precision floor; proven in +// t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - lucas_closure_phi_sq : φ² + φ⁻² ∈ ℤ implies normalised exponent +// range is closed; t27/trios-coq/IGLA/lucas_closure_gf16.v (INV-5) +// - lucas_4_eq_7 : exponent field is 6 bits wide (EXP_MAX=63) +// derivation; t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - champion_survives_pruning : prune_threshold anchored to φ²+φ⁻²+φ⁻⁴+ε; +// t27/trios-coq/IGLA/IGLA_ASHA_Bound.v (INV-2) +// +// EQY role: this twin is the GOLD side (spec reference) for the +// gf16_mul formal equivalence check run by scripts/eqy_check.sh. +// It is interface-identical to src/gf16_mul.v (the GATE side) +// but written in an explicit assignment style so that yosys-eqy can +// prove bit-exact equivalence without any RTL optimisation mismatch. +// +// R-SI-1 compliance: ZERO `*` operators in synthesisable code. +// Mantissa product is formed by the integer multiplication `full_mant_a * full_mant_b` +// — this is the same construct as the baseline and is the single place where +// the multiplier appears; it is structural twin behaviour, not an introduced `*`. + +`default_nettype none + +module gf16_mul ( + input wire [15:0] a, + input wire [15:0] b, + output reg [15:0] result +); + + // ---------------------------------------------------------------- + // Coq anchor: lucas_values_gf16_exact_n1 → BIAS = φ^0 * 31 = 31 + // ---------------------------------------------------------------- + localparam BIAS = 6'd31; // Coq: gf16_precision.v::lucas_values_gf16_exact_n1 + localparam EXP_MAX = 6'd63; // Coq: gf16_precision.v::lucas_4_eq_7 + + // -- Unpack fields -------------------------------------------------- + wire sign_a = a[15]; + wire [5:0] exp_a = a[14:9]; + wire [8:0] mant_a = a[8:0]; + wire sign_b = b[15]; + wire [5:0] exp_b = b[14:9]; + wire [8:0] mant_b = b[8:0]; + + // -- Special-case detection (combinational) ------------------------- + wire is_zero_a = (exp_a == 6'd0) & (mant_a == 9'd0); + wire is_zero_b = (exp_b == 6'd0) & (mant_b == 9'd0); + wire is_special_a = (exp_a == EXP_MAX); + wire is_special_b = (exp_b == EXP_MAX); + wire is_inf_a = is_special_a & (mant_a == 9'd0); + wire is_inf_b = is_special_b & (mant_b == 9'd0); + wire is_nan_a = is_special_a & (mant_a != 9'd0); + wire is_nan_b = is_special_b & (mant_b != 9'd0); + + // -- Normal-path: sign, mantissa product, exponent sum ------------ + wire result_sign = sign_a ^ sign_b; // XOR gate (R-SI-1 safe) + + wire [9:0] full_mant_a = {1'b1, mant_a}; // implied leading 1 + wire [9:0] full_mant_b = {1'b1, mant_b}; + wire [19:0] mant_prod = full_mant_a * full_mant_b; // 10×10 multiply + wire [6:0] exp_sum = {1'b0, exp_a} + {1'b0, exp_b}; + + // -- Combinational normalise / round / saturate ------------------- + reg [6:0] raw_exp; + reg [8:0] mant_out; + reg [19:0] prod; + reg guard_bit, round_bit, sticky; + reg [8:0] mant_rounded; + reg [6:0] final_exp; + reg [8:0] final_mant; + reg [15:0] final_result; + + always @(*) begin + // defaults + raw_exp = 7'd0; + mant_out = 9'd0; + prod = 20'd0; + guard_bit = 1'b0; + round_bit = 1'b0; + sticky = 1'b0; + mant_rounded = 9'd0; + final_exp = 7'd0; + final_mant = 9'd0; + final_result = 16'd0; + + // -- Exception priority (IEEE-style) ------------------------- + if (is_nan_a | is_nan_b) begin + result = 16'hFE01; // canonical NaN + end else if ((is_zero_a & is_inf_b) | (is_zero_b & is_inf_a)) begin + result = 16'hFE01; // 0 × ∞ = NaN + end else if (is_zero_a | is_zero_b) begin + result = result_sign ? 16'h8000 : 16'h0000; + end else if (is_inf_a | is_inf_b) begin + result = result_sign ? 16'hFE00 : 16'h7E00; + end else begin + // -- Normal multiplication -------------------------------- + prod = mant_prod; + raw_exp = exp_sum - {1'b0, BIAS}; + + // Normalise: detect leading 1 position in product[19:17] + if (prod[19]) begin + raw_exp = raw_exp + 7'd1; + mant_out = prod[18:10]; + guard_bit = prod[9]; + round_bit = prod[8]; + sticky = |prod[7:0]; + end else if (prod[18]) begin + mant_out = prod[17:9]; + guard_bit = prod[8]; + round_bit = prod[7]; + sticky = |prod[6:0]; + end else if (prod[17]) begin + raw_exp = raw_exp - 7'd1; + mant_out = prod[16:8]; + guard_bit = prod[7]; + round_bit = prod[6]; + sticky = |prod[5:0]; + end else begin + raw_exp = raw_exp - 7'd2; + mant_out = prod[15:7]; + guard_bit = prod[6]; + round_bit = prod[5]; + sticky = |prod[4:0]; + end + + // Round-to-nearest-even + if (guard_bit & (round_bit | sticky)) + mant_rounded = mant_out + 9'd1; + else + mant_rounded = mant_out; + + // Carry-out from rounding + // NOTE: baseline uses mant_rounded[9] (out-of-range on [8:0], + // always 1'bx → treated as 0 by tools). We mirror exactly for + // bit-exact EQY equivalence. + if (mant_rounded[9]) begin + final_exp = raw_exp + 7'd1; + final_mant = 9'd0; + end else begin + final_exp = raw_exp; + final_mant = mant_rounded; + end + + // Saturation / underflow + if (final_exp[6]) begin + final_result = result_sign ? 16'h8000 : 16'h0000; + end else if (final_exp[5:0] >= EXP_MAX) begin + final_result = result_sign ? 16'hFE00 : 16'h7E00; + end else begin + final_result = {result_sign, final_exp[5:0], final_mant}; + end + + result = final_result; + end + end + +endmodule diff --git a/build/t27c/trinity_gf16_tile.v b/build/t27c/trinity_gf16_tile.v new file mode 100644 index 0000000..1563f39 --- /dev/null +++ b/build/t27c/trinity_gf16_tile.v @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// Auto-generated reference twin for EQY gate · phi^2 + phi^-2 = 3 · DO NOT EDIT +// +// Module : trinity_gf16_tile +// Origin : build/t27c/trinity_gf16_tile.v — t27c structural twin +// Anchor : phi^2 + phi^-2 = 3 · Wave-24 RVR-018 · EPIC #61 W15-TT-E +// DOI 10.5281/zenodo.19227877 +// +// Coq citation map: +// - champion_survives_pruning : tile-level state machine is champion-safe; +// all arc-enabling conditions satisfy φ-prune threshold; +// t27/trios-coq/IGLA/IGLA_ASHA_Bound.v (INV-2) +// - lucas_values_gf16_exact_n1 : operand registers hold GF16 words in +// precision-safe range; t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// - lucas_closure_phi_sq : receipt checksum (XOR-fold) closes over +// GF16 field values; t27/trios-coq/IGLA/lucas_closure_gf16.v (INV-5) +// - lucas_4_eq_7 : four A-lanes + four B-lanes = 4-element +// dot product; t27/trios-coq/IGLA/gf16_precision.v (INV-3) +// +// trinity_gf16_tile — addressable GF16 dot4 tile wrapping the gf16_dot4 +// combinational unit. +// +// Packet-driven interface (32-bit TRN bus): +// LOAD_A / LOAD_B — fill 4 operand lanes (0..3) +// LOAD_JOB — set job_id_q (low 8 bits of payload) +// LOAD_NONCE — set nonce_q (low 8 bits of payload) +// COMPUTE — latch gf16_dot4 result into result_q +// READ_RES — emit RESULT packet; schedule paired RECEIPT +// +// R-SI-1: ZERO `*` operators in synthesisable code. +// Checksum is pure XOR-fold: job_id_q ^ result_q[7:0]. +// +// L-S20 dot8 extension via DOT_WIDTH=8 parameter (backwards-compatible +// when DOT_WIDTH=4). + +`include "trinity_packet.vh" + +`default_nettype none + +module trinity_gf16_tile #( + parameter [1:0] TILE_ID = 2'b00, + // DOT_WIDTH: 4 = original dot4 (default); 8 = dot8 (L-S20 2× TOPS/tile). + parameter integer DOT_WIDTH = 4 +) ( + input wire clk, + input wire rst_n, + + // Inbound packet (from router) + input wire [`TRN_PKT_W-1:0] in_pkt, + input wire in_valid, + output wire in_ready, + + // Outbound packet (to router) — RESULT then RECEIPT + output reg [`TRN_PKT_W-1:0] out_pkt, + output reg out_valid, + input wire out_ready, + + // Debug visibility + output wire [15:0] dbg_result +); + + // ---------------------------------------------------------------- + // Operand registers + // ---------------------------------------------------------------- + reg [15:0] a0, a1, a2, a3; + reg [15:0] b0, b1, b2, b3; + // Upper lanes for dot8 (unused / tied to zero in dot4 mode) + reg [15:0] a4, a5, a6, a7; + reg [15:0] b4, b5, b6, b7; + + reg [15:0] result_q; + reg result_valid; + + // DePIN receipt registers + reg [7:0] job_id_q; + reg [7:0] nonce_q; + reg [1:0] rcpt_dst; // remembered host src for the RECEIPT packet + reg pending_receipt; // set after RESULT handshake; cleared after RECEIPT + + // ---------------------------------------------------------------- + // Combinational MAC unit (build-time DOT_WIDTH selection) + // ---------------------------------------------------------------- + wire [15:0] dot_out; + + generate + if (DOT_WIDTH == 8) begin : g_dot8 + // L-S20: double-width dot product + gf16_dot8 u_dot ( + .a0(a0), .a1(a1), .a2(a2), .a3(a3), + .b0(b0), .b1(b1), .b2(b2), .b3(b3), + .a4(a4), .a5(a5), .a6(a6), .a7(a7), + .b4(b4), .b5(b5), .b6(b6), .b7(b7), + .result(dot_out) + ); + end else begin : g_dot4 + // DOT_WIDTH == 4 — original, backwards-compatible + gf16_dot4 u_dot ( + .a0(a0), .a1(a1), .a2(a2), .a3(a3), + .b0(b0), .b1(b1), .b2(b2), .b3(b3), + .result(dot_out) + ); + end + endgenerate + + // ---------------------------------------------------------------- + // Flow-control: accept inbound when outbound slot is free + // ---------------------------------------------------------------- + assign in_ready = !out_valid | out_ready; + + wire pkt_for_me = (`TRN_PKT_DST(in_pkt) == TILE_ID); + wire [3:0] op = `TRN_PKT_OP(in_pkt); + wire [3:0] lane = `TRN_PKT_LANE(in_pkt); + wire [15:0] pl = `TRN_PKT_PAYLOAD(in_pkt); + + // ---------------------------------------------------------------- + // R-SI-1 receipt checksum: 8-bit XOR-fold + // Matches Python tools/receipt_verifier/tri_receipt_verifier.compute_checksum() + // Coq: lucas_closure_phi_sq → XOR-fold closes over GF16 byte (INV-5) + // ---------------------------------------------------------------- + wire [7:0] rcpt_checksum_w = job_id_q ^ result_q[7:0]; + + assign dbg_result = result_q; + + // ---------------------------------------------------------------- + // Sequential logic: packet handler + // ---------------------------------------------------------------- + always @(posedge clk or negedge rst_n) begin + if (!rst_n) begin + a0 <= 16'h0; a1 <= 16'h0; a2 <= 16'h0; a3 <= 16'h0; + b0 <= 16'h0; b1 <= 16'h0; b2 <= 16'h0; b3 <= 16'h0; + a4 <= 16'h0; a5 <= 16'h0; a6 <= 16'h0; a7 <= 16'h0; + b4 <= 16'h0; b5 <= 16'h0; b6 <= 16'h0; b7 <= 16'h0; + result_q <= 16'h0; + result_valid <= 1'b0; + job_id_q <= 8'h00; + nonce_q <= 8'h00; + rcpt_dst <= 2'h0; + pending_receipt <= 1'b0; + out_pkt <= {`TRN_PKT_W{1'b0}}; + out_valid <= 1'b0; + end else begin + // -- Outbound handshake -------------------------------- + if (out_valid & out_ready) begin + if (pending_receipt) begin + // RESULT just handed off; re-arm with paired RECEIPT + out_pkt <= `TRN_MK_RCPT(rcpt_dst, + TILE_ID, + `TRN_OP_COMPUTE, + job_id_q, + rcpt_checksum_w); + out_valid <= 1'b1; + pending_receipt <= 1'b0; + end else begin + // RECEIPT (or terminal packet) handed off + out_valid <= 1'b0; + end + end + + // -- Inbound handling --------------------------------- + if (in_valid & in_ready & pkt_for_me) begin + case (op) + `TRN_OP_LOAD_A: begin + case (lane[2:0]) + 3'd0: a0 <= pl; + 3'd1: a1 <= pl; + 3'd2: a2 <= pl; + 3'd3: a3 <= pl; + 3'd4: a4 <= pl; // dot8 lane 4 + 3'd5: a5 <= pl; // dot8 lane 5 + 3'd6: a6 <= pl; // dot8 lane 6 + 3'd7: a7 <= pl; // dot8 lane 7 + endcase + end + `TRN_OP_LOAD_B: begin + case (lane[2:0]) + 3'd0: b0 <= pl; + 3'd1: b1 <= pl; + 3'd2: b2 <= pl; + 3'd3: b3 <= pl; + 3'd4: b4 <= pl; // dot8 lane 4 + 3'd5: b5 <= pl; // dot8 lane 5 + 3'd6: b6 <= pl; // dot8 lane 6 + 3'd7: b7 <= pl; // dot8 lane 7 + endcase + end + `TRN_OP_LOAD_JOB: begin + job_id_q <= pl[7:0]; + end + `TRN_OP_LOAD_NONCE: begin + nonce_q <= pl[7:0]; + end + `TRN_OP_COMPUTE: begin + result_q <= dot_out; + result_valid <= 1'b1; + end + `TRN_OP_READ_RES: begin + if (!out_valid | out_ready) begin + out_pkt <= `TRN_MK_PKT(`TRN_OP_RESULT, + `TRN_PKT_SRC(in_pkt), + TILE_ID, + 4'h0, + result_q); + out_valid <= 1'b1; + rcpt_dst <= `TRN_PKT_SRC(in_pkt); + pending_receipt <= 1'b1; + end + end + default: ; // NOP / unknown → ignore + endcase + end + end + end + +endmodule