From 3250913b9cbce020bb543b05f592656a877a5828 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Thu, 23 Jan 2025 19:53:10 +0000
Subject: [PATCH 1/4] Avoid clock assignment for reliable simulation.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 0ee84b2f79..cdc770d5c3 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -129,8 +129,6 @@ module mvu_vvu_axi #(
 		end
 	end
 
-	uwire  clk = ap_clk;
-	uwire  clk2x = ap_clk2x;
 	uwire  rst = !ap_rst_n;
 
 	//- Replay to Accommodate Neuron Fold -----------------------------------
@@ -144,7 +142,7 @@ module mvu_vvu_axi #(
 	localparam int unsigned  SF = MW/SIMD;
 	localparam int unsigned  NF = MH/PE;
 	replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay (
-		.clk, .rst,
+		.clk(ap_clk), .rst,
 		.ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)),
 		.ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
 	);
@@ -190,7 +188,6 @@ module mvu_vvu_axi #(
 		typedef logic [PE    -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH    -1:0]  dsp_w_t;
 		typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  dsp_a_t;
 
-		uwire  dsp_clk;
 		uwire  dsp_en;
 
 		uwire  dsp_last;
@@ -202,8 +199,7 @@ module mvu_vvu_axi #(
 		uwire dsp_p_t  dsp_p;
 
 		if(!PUMPED_COMPUTE) begin : genUnpumpedCompute
-			assign	dsp_clk = clk;
-			assign	dsp_en  = en;
+			assign	dsp_en = en;
 
 			assign	dsp_last = alast && avld;
 			assign	dsp_zero = !istb;
@@ -214,15 +210,14 @@ module mvu_vvu_axi #(
 			assign	odat = dsp_p;
 		end : genUnpumpedCompute
 		else begin : genPumpedCompute
-			assign	dsp_clk = clk2x;
 
 			// Identify second fast cycle just before active slow clock edge
 			logic  Active = 0;
 			if(1) begin : blkActive
 				uwire  clk_lut[2];	// Put some LUT delay on the input from the fast clock net
-				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk));
+				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(ap_clk));
 				(* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0]));
-				always_ff @(posedge clk2x)  Active <= clk_lut[1];
+				always_ff @(posedge ap_clk2x)  Active <= clk_lut[1];
 			end : blkActive
 
 			// The input for a slow cycle is split across two fast cycles along the SIMD dimension.
@@ -237,7 +232,7 @@ module mvu_vvu_axi #(
 				for(genvar  i =    0; i <       SIMD; i++)  assign  w[i] = mvu_w[pe][i];
 				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  w[i] = 0;
 
-				always_ff @(posedge clk2x) begin
+				always_ff @(posedge ap_clk2x) begin
 					if(rst)      W[pe] <= 'x;
 					else if(en)  W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD];
 				end
@@ -251,7 +246,7 @@ module mvu_vvu_axi #(
 				for(genvar  i =    0; i <       SIMD; i++)  assign  a[i] = amvau_i[pe][i];
 				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  a[i] = 0;
 
-				always_ff @(posedge clk2x) begin
+				always_ff @(posedge ap_clk2x) begin
 					if(rst)      A[pe] <= 'x;
 					else if(en)  A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD];
 				end
@@ -260,7 +255,7 @@ module mvu_vvu_axi #(
 
 			logic  Zero = 1;
 			logic  Last = 0;
-			always_ff @(posedge clk2x) begin
+			always_ff @(posedge ap_clk2x) begin
 				if(rst) begin
 					Zero <= 1;
 					Last <= 0;
@@ -283,7 +278,7 @@ module mvu_vvu_axi #(
 			// clock to pick it up.
 			logic    Vld = 0;
 			dsp_p_t  P = 'x;
-			always_ff @(posedge clk2x) begin
+			always_ff @(posedge ap_clk2x) begin
 				if(rst) begin
 					Vld <= 0;
 					P   <= 'x;
@@ -307,7 +302,7 @@ module mvu_vvu_axi #(
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 				.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(dsp_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
@@ -318,7 +313,7 @@ module mvu_vvu_axi #(
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
 				.VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(dsp_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
@@ -329,7 +324,7 @@ module mvu_vvu_axi #(
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
 				.VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(dsp_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
@@ -339,7 +334,7 @@ module mvu_vvu_axi #(
 				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(dsp_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
@@ -366,7 +361,7 @@ module mvu_vvu_axi #(
 	assign	en = A.rdy;
 	uwire  b_load = !B.vld || m_axis_output_tready;
 
-	always_ff @(posedge clk) begin
+	always_ff @(posedge ap_clk) begin
 		if(rst) begin
 			A <= '{ rdy: 1, default: 'x };
 			B <= '{ vld: 0, default: 'x };

From d6cd7562f9cbff0bdb2516ea9fac6f75e2df0634 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 27 Jan 2025 19:39:05 +0000
Subject: [PATCH 2/4] Free running DSP cores without enables.

---
 finn-rtllib/mvu/mvu_4sx4u.sv          |   8 ++
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv    |   8 ++
 finn-rtllib/mvu/mvu_pkg.sv            |  22 +++++
 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv |  11 ++-
 finn-rtllib/mvu/mvu_vvu_axi.sv        | 122 ++++++++++++++------------
 5 files changed, 115 insertions(+), 56 deletions(-)
 create mode 100644 finn-rtllib/mvu/mvu_pkg.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 1f6e97281e..5541ee74f9 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -58,6 +58,8 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	import  mvu_pkg::*;
+
 	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
@@ -129,6 +131,12 @@ module mvu_4sx4u #(
 		else if(en)  L <= { last, L[1:4] };
 	end
 	assign	vld = L[5];
+	initial begin
+		if(mvu_pipeline_depth("mvu_4sx4u") < $bits(L)) begin
+			$error("%m: Outdated pipeline depth computation.");
+			$stop;
+		end
+	end
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
 	localparam int unsigned  PIPE_COUNT = (PE+3)/4;
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index dabb36647e..fbe85b624f 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -57,6 +57,8 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
+	import  mvu_pkg::*;
+
 	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
@@ -83,6 +85,12 @@ module mvu_8sx8u_dsp48 #(
 		else if(en)  L <= { last, L[1:4] };
 	end
 	assign	vld = L[5];
+	initial begin
+		if(mvu_pipeline_depth("mvu_8sx8u_dsp48") < $bits(L)) begin
+			$error("%m: Outdated pipeline depth computation.");
+			$stop;
+		end
+	end
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
 	localparam int unsigned  SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH;
diff --git a/finn-rtllib/mvu/mvu_pkg.sv b/finn-rtllib/mvu/mvu_pkg.sv
new file mode 100644
index 0000000000..fa613b2aae
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_pkg.sv
@@ -0,0 +1,22 @@
+package mvu_pkg;
+	function int unsigned mvu_pipeline_depth(
+		input string        core,
+		input int unsigned  simd   = 0,
+		input int unsigned  seglen = 0
+	);
+		unique case(core)
+		"mvu_vvu_8sx9_dsp58": begin
+			automatic int  chainlen = (simd+2)/3;
+			if(seglen == 0)  seglen = chainlen;
+			return  3 + (chainlen-1)/seglen;
+		end
+		"mvu_4sx4u", "mvu_4sx4u_dsp48e1", "mvu_4sx4u_dsp48e2",
+		"mvu_8sx8u_dsp48":
+			return  5;
+		default: begin
+			$error("Unknown MVU core '%s'", core);
+			$finish;
+		end
+		endcase
+	endfunction : mvu_pipeline_depth
+endpackage : mvu_pkg
diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 2734f37cf3..7515b2e868 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -45,8 +45,7 @@ module mvu_vvu_8sx9_dsp58 #(
 
 	localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
 	localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
-  )
-  (
+  )  (
     // Global Control
 	input   logic clk,
     input   logic rst,
@@ -62,6 +61,8 @@ module mvu_vvu_8sx9_dsp58 #(
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
+	import  mvu_pkg::*;
+
 	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
@@ -90,6 +91,12 @@ module mvu_vvu_8sx9_dsp58 #(
 		end
 	end
 	assign vld = L[0];
+	initial begin
+		if(mvu_pipeline_depth("mvu_vvu_8sx9_dsp58", SIMD, SEGMENTLEN) < $bits(L)) begin
+			$error("%m: Outdated pipeline depth computation.");
+			$stop;
+		end
+	end
 
 //-------------------- Shift register for ZERO flag --------------------\\
 	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index cdc770d5c3..00511bb167 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -91,6 +91,7 @@ module mvu_vvu_axi #(
 	output	logic  m_axis_output_tvalid,
 	input	logic  m_axis_output_tready
 );
+	import  mvu_pkg::*;
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
@@ -173,10 +174,9 @@ module mvu_vvu_axi #(
 	end : genVVUInput
 
 	//- Flow Control Bracket around Compute Core ----------------------------
-	uwire en;
-	uwire istb = avld && s_axis_weights_tvalid;
-	assign ardy = en && s_axis_weights_tvalid;
-	assign s_axis_weights_tready = en && avld;
+	uwire  idle;
+	assign	ardy = !idle && s_axis_weights_tvalid;
+	assign	s_axis_weights_tready = !idle && avld;
 
 	//- Conditionally Pumped DSP Compute ------------------------------------
 	typedef logic [PE-1:0][ACCU_WIDTH-1:0]  dsp_p_t;
@@ -188,8 +188,6 @@ module mvu_vvu_axi #(
 		typedef logic [PE    -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH    -1:0]  dsp_w_t;
 		typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0]  dsp_a_t;
 
-		uwire  dsp_en;
-
 		uwire  dsp_last;
 		uwire  dsp_zero;
 		uwire dsp_w_t  dsp_w;
@@ -199,10 +197,8 @@ module mvu_vvu_axi #(
 		uwire dsp_p_t  dsp_p;
 
 		if(!PUMPED_COMPUTE) begin : genUnpumpedCompute
-			assign	dsp_en = en;
-
-			assign	dsp_last = alast && avld;
-			assign	dsp_zero = !istb;
+			assign	dsp_last = alast && avld && !idle;
+			assign	dsp_zero = idle || !s_axis_weights_tvalid || !avld;
 			assign	dsp_w = mvu_w;
 			assign	dsp_a = amvau_i;
 
@@ -261,12 +257,11 @@ module mvu_vvu_axi #(
 					Last <= 0;
 				end
 				else if(en) begin
-					Zero <= !istb;
-					Last <= alast && avld && Active;
+					Zero <= idle || !s_axis_weights_tvalid || !avld;
+					Last <= alast && avld && !idle && Active;
 				end
 			end
 
-			assign	dsp_en = en;
 			assign	dsp_last = Last;
 			assign	dsp_zero = Zero;
 			assign	dsp_w = W;
@@ -294,7 +289,7 @@ module mvu_vvu_axi #(
 		end : genPumpedCompute
 
 		case(COMPUTE_CORE)
-		"mvu_vvu_8sx9_dsp58":
+		"mvu_vvu_8sx9_dsp58": begin : core
 			mvu_vvu_8sx9_dsp58 #(
 				.IS_MVU(IS_MVU),
 				.PE(PE), .SIMD(DSP_SIMD),
@@ -302,42 +297,46 @@ module mvu_vvu_axi #(
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
 				.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
-		"mvu_4sx4u_dsp48e1":
+		end
+		"mvu_4sx4u_dsp48e1": begin : core
 			mvu_4sx4u #(
 				.PE(PE), .SIMD(DSP_SIMD),
 				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
 				.VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
-		"mvu_4sx4u_dsp48e2":
+		end
+		"mvu_4sx4u_dsp48e2": begin : core
 			mvu_4sx4u #(
 				.PE(PE), .SIMD(DSP_SIMD),
 				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
 				.VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
-		"mvu_8sx8u_dsp48":
+		end
+		"mvu_8sx8u_dsp48": begin : core
 			mvu_8sx8u_dsp48 #(
 				.PE(PE), .SIMD(DSP_SIMD),
 				.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
 				.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
 			) core (
-				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en),
+				.clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1),
 				.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
 				.vld(dsp_vld), .p(dsp_p)
 			);
+		end
 		default: initial begin
 			$error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
 			$finish;
@@ -346,41 +345,56 @@ module mvu_vvu_axi #(
 
 	end : blkDsp
 
-//-------------------- Output register slice --------------------\\
-	// Make `en`computation independent from external inputs.
-	// Drive all outputs from registers.
-	struct packed {
-		logic rdy;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	}  A = '{ rdy: 1, default: 'x };	// side-step register used when encountering backpressure
-	struct packed {
-		logic vld;
-		logic [PE-1:0][ACCU_WIDTH-1:0] dat;
-	}  B = '{ vld: 0, default: 'x };	// ultimate output register
-
-	assign	en = A.rdy;
-	uwire  b_load = !B.vld || m_axis_output_tready;
-
-	always_ff @(posedge ap_clk) begin
-		if(rst) begin
-			A <= '{ rdy: 1, default: 'x };
-			B <= '{ vld: 0, default: 'x };
+	if(1) begin : blkOutput
+		localparam int unsigned  PIPELINE_DEPTH = mvu_pipeline_depth(COMPUTE_CORE, SIMD, SEGMENTLEN);
+		localparam int unsigned  MIN_OUT_PERIOD = MW / SIMD;
+		localparam int unsigned  MAX_IN_FLIGHT  = 1 + PIPELINE_DEPTH / MIN_OUT_PERIOD;
+
+		typedef logic [PE-1:0][ACCU_WIDTH-1:0]  output_t;
+
+		logic signed [$clog2(MAX_IN_FLIGHT+1):0]  OPtr = '1;	// -1 | 0, 1, ..., MAX_IN_FLIGHT
+		output_t  OBuf[0:MAX_IN_FLIGHT];
+		logic     OVld  =  0;
+		output_t  OReg  = 'x;
+		logic     OLock =  0;	// Lock upon backpressure (second entry into queue)
+
+		// Catch every output into (SRL) Output Queue
+		always_ff @(posedge ap_clk) begin
+			if(ovld)  OBuf <= { odat, OBuf[0:MAX_IN_FLIGHT-1] };
 		end
-		else begin
-			if(A.rdy)  A.dat <= odat;
-			A.rdy <= (A.rdy && !ovld) || b_load;
-
-			if(b_load) begin
-				B <= '{
-					vld: ovld || !A.rdy,
-					dat: A.rdy? odat : A.dat
-				};
+
+		always_ff @(posedge ap_clk) begin
+			if(rst) begin
+				OPtr  <= '1;
+				OVld  <=  0;
+				OReg  <= 'x;
+				OLock <=  0;
+			end
+			else begin
+				automatic logic  push = ovld;
+				automatic logic  pop  = (m_axis_output_tready || !OVld) && !OPtr[$left(OPtr)];
+				assert(pop || !push || (OPtr < $signed(MAX_IN_FLIGHT))) else begin
+					$error("%m: Overflowing output queue.");
+					$stop;
+				end
+				OPtr <= OPtr + $signed(push == pop? 0 : push? 1 : -1);
+
+				if(OPtr[$left(OPtr)])                   OLock <= 0;
+				else if(OVld && !m_axis_output_tready)  OLock <= 1;
+
+				if(m_axis_output_tready || !OVld) begin
+					OVld <= !OPtr[$left(OPtr)];
+					OReg <= OBuf[OPtr[$left(OPtr)-1:0]];
+				end
 			end
 		end
-	end
-	assign	m_axis_output_tvalid = B.vld;
-	// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
-	// These extra bits should never be used. Why not 'x them out?
-	assign	m_axis_output_tdata  = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
+		assign	idle = OLock;
+
+		assign	m_axis_output_tvalid = OVld;
+		// Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
+		// These extra bits should never be used. Why not 'x them out?
+		assign	m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){OReg[PE-1][ACCU_WIDTH-1]}}, OReg };
+
+	end : blkOutput
 
 endmodule : mvu_vvu_axi

From ae0574cf57ef7d1da564143ce1f438784f2c24a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Mon, 27 Jan 2025 20:20:25 +0000
Subject: [PATCH 3/4] Remove obsolete references to `en` signal.

---
 finn-rtllib/mvu/mvu_vvu_axi.sv | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 00511bb167..368c338f37 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -229,8 +229,8 @@ module mvu_vvu_axi #(
 				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  w[i] = 0;
 
 				always_ff @(posedge ap_clk2x) begin
-					if(rst)      W[pe] <= 'x;
-					else if(en)  W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD];
+					if(rst)  W[pe] <= 'x;
+					else     W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD];
 				end
 
 			end : genPERegW
@@ -243,8 +243,8 @@ module mvu_vvu_axi #(
 				for(genvar  i = SIMD; i < 2*DSP_SIMD; i++)  assign  a[i] = 0;
 
 				always_ff @(posedge ap_clk2x) begin
-					if(rst)      A[pe] <= 'x;
-					else if(en)  A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD];
+					if(rst)  A[pe] <= 'x;
+					else     A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD];
 				end
 
 			end : genPERegA
@@ -256,7 +256,7 @@ module mvu_vvu_axi #(
 					Zero <= 1;
 					Last <= 0;
 				end
-				else if(en) begin
+				else begin
 					Zero <= idle || !s_axis_weights_tvalid || !avld;
 					Last <= alast && avld && !idle && Active;
 				end
@@ -278,7 +278,7 @@ module mvu_vvu_axi #(
 					Vld <= 0;
 					P   <= 'x;
 				end
-				else if(en) begin
+				else begin
 					if(dsp_vld)  P <= dsp_p;
 					Vld <= dsp_vld || (Vld && !Active);
 				end
@@ -353,6 +353,7 @@ module mvu_vvu_axi #(
 		typedef logic [PE-1:0][ACCU_WIDTH-1:0]  output_t;
 
 		logic signed [$clog2(MAX_IN_FLIGHT+1):0]  OPtr = '1;	// -1 | 0, 1, ..., MAX_IN_FLIGHT
+		(* SHREG_EXTRACT = "YES" *)
 		output_t  OBuf[0:MAX_IN_FLIGHT];
 		logic     OVld  =  0;
 		output_t  OReg  = 'x;

From 92bcbb3a34a0fe9ac9f5dedb0ee174cc851cb72e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= <thomas.preusser@xilinx.com>
Date: Fri, 31 Jan 2025 12:57:34 +0000
Subject: [PATCH 4/4] Abolish package for synthesis compatibility and homogenize
 pipelining.

---
 finn-rtllib/mvu/mvu_4sx4u.sv          | 412 ++++++++++++++++++++------
 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv    | 333 +++++++++++++--------
 finn-rtllib/mvu/mvu_pkg.sv            |  22 --
 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv |  27 +-
 finn-rtllib/mvu/mvu_vvu_axi.sv        |   9 +-
 5 files changed, 557 insertions(+), 246 deletions(-)
 delete mode 100644 finn-rtllib/mvu/mvu_pkg.sv

diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
index 5541ee74f9..5dadefbaa2 100644
--- a/finn-rtllib/mvu/mvu_4sx4u.sv
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -39,6 +39,7 @@ module mvu_4sx4u #(
 	int unsigned  ACCU_WIDTH,
 
 	int unsigned  VERSION = 1,	// Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS
+					// Allowed versions - 1: DSP48E1, 2: DSP48E2, 3: DSP58
 	bit  SIGNED_ACTIVATIONS = 0,
 	bit  NARROW_WEIGHTS   = 0,	// Weights from [-7:7] rather than [-8:7]
 	bit  FORCE_BEHAVIORAL = 0
@@ -58,9 +59,8 @@ module mvu_4sx4u #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	import  mvu_pkg::*;
 
-	// for verilator always use behavioral code
+	// For Verilator: always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
 		1 ||
@@ -68,10 +68,39 @@ module mvu_4sx4u #(
 		FORCE_BEHAVIORAL;
 
 	//-----------------------------------------------------------------------
-	// Determine Lane Configuration
+	// Startup Recovery Watchdog
+	//  The DSP slice needs 100ns of recovery time after initial startup before
+	//  being able to ingest input properly. This watchdog discovers violating
+	//  stimuli during simulation and produces a corresponding warning.
+	if(1) begin : blkRecoveryWatch
+		logic  Dirty = 1;	// Set from time zero; marks the startup recovery window
+		initial begin
+			#100ns;
+			Dirty <= 0;	// Window closes after 100ns of simulation time
+		end
+
+		always_ff @(posedge clk) begin
+			assert(!Dirty || rst || !en || zero) else begin	// While Dirty, only reset, disabled or zeroed cycles pass
+				$warning("%m: Feeding input during DSP startup recovery. Expect functional errors.");
+				$stop;	// Simulation is halted even though only a warning is reported
+			end
+		end
+	end : blkRecoveryWatch
+
+	//-----------------------------------------------------------------------
+	// Determine version-specific constraints
+	typedef enum { DSP48E1 = 1, DSP48E2 = 2, DSP58 = 3 }  dsp_version_e;
+	localparam int unsigned  A_WIDTH = 25 + 2*(VERSION > 1);     	// Width of A datapath
+	localparam int unsigned  B_WIDTH = 18 + 6*(VERSION > 2);     	// Width of B datapath
+	localparam int unsigned  P_WIDTH = VERSION == DSP58? 58 : 48;	// Width of P datapath
+
 	initial begin
-		if(!NARROW_WEIGHTS && (VERSION == 1)) begin
-			$error("%m: Need NARROW_WEIGHTS for DSP48E1.");
+		if(WEIGHT_WIDTH > 4) begin
+			$error("%m: Requested WEIGHT_WIDTH=%0d beyond support for 4 bits.", WEIGHT_WIDTH);
+			$finish;
+		end
+		if(ACTIVATION_WIDTH > 4) begin
+			$error("%m: Requested ACTIVATION_WIDTH=%0d beyond support for 4 bits.", ACTIVATION_WIDTH);
 			$finish;
 		end
 	end
@@ -80,63 +109,87 @@ module mvu_4sx4u #(
 	 * Lane Slicing
 	 *	Assumptions:
 	 *	 - Internal lane widths differ, at most, by a single bit.
+	 *	 - The minimum lane width is WEIGHT_WIDTH + ACTIVATION_WIDTH - 1 bits
+	 *	   so as to confine cross-lane overflows to {-1,0,1}.
 	 *	 - The rightmost lane (#0) has the maximum internal width.
 	 *	 - The leftmost lane (#3) extends into the wide DSP accumulation path and
-	 *	   is constrained by ACCU_WIDTH rather than the next lane. It doesn't have
-	 *	   an external high extension.
+	 *	   is typically constrained by ACCU_WIDTH rather than the next lane. If so,
+	 *	   it doesn't have an external high extension.
 	 *	 - The one but leftmost lane (#2) has the minimum internal width and, hence,
-	 *	   the macimum external high extension.
+	 *	   the maximum external high extension.
 	 */
 	typedef int unsigned  lane_offset_v[4:0];
 	function lane_offset_v sliceLanes();
-		unique case(VERSION)
-		1: begin
-			return  NARROW_WEIGHTS?
-				lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } :
-				lane_offset_v'{ 0, 0, 0, 0, 0 };	// not supported
+		localparam int unsigned  MIN_LANE_WIDTH = WEIGHT_WIDTH + ACTIVATION_WIDTH - 1;
+		automatic lane_offset_v  res;
+
+		// Determine number of bits beyond accommodating minimum lane width
+		automatic int  bit_slack = A_WIDTH;
+		// protect sign if not narrow, leftmost weight entry, minimum for rest of lanes
+		bit_slack -= !NARROW_WEIGHTS + WEIGHT_WIDTH + 3*MIN_LANE_WIDTH;
+		if(bit_slack < 0) begin
+			localparam  dsp_version_e  VER = dsp_version_e'(VERSION);
+			$error(
+				"%m: Cannot accommodate %0d-bit %snarrow weights on %s.",
+				WEIGHT_WIDTH, NARROW_WEIGHTS? "" : "non-", VER.name
+			);
+			$finish;
 		end
-		2: begin
-			return  NARROW_WEIGHTS?
-				lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } :
-				lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 };
+
+		// Distribute slack bits preferring right lanes
+		res[0] = 0;
+		for(int unsigned  i = 1; i < 4; i++) begin
+			automatic int unsigned  extra = (bit_slack + (3-i)) / (4-i);
+			res[i] = res[i-1] + MIN_LANE_WIDTH + extra;
+			bit_slack -= extra;
 		end
-		endcase
+
+		// Last lane bounded by the smaller of ACCU_WIDTH or P datapath
+		res[4] = res[3] + ACCU_WIDTH;
+		if(res[4] > P_WIDTH)  res[4] = P_WIDTH;
+
+		return  res;
 	endfunction : sliceLanes
 	localparam lane_offset_v  OFFSETS = sliceLanes();
 
+	function int unsigned sum_width(input int unsigned  n, input int unsigned  w);	// Bit width holding the sum of n unsigned w-bit values
+		return  w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n);	// Exact count for w <= 16, conservative bound beyond; NOTE(review): split presumably keeps n*(2**w - 1) within int range - confirm
+	endfunction : sum_width
 	function int unsigned lo_width(input int unsigned  i);
 		return  OFFSETS[i+1] - OFFSETS[i];
 	endfunction : lo_width
 	function int unsigned hi_width(input int unsigned  i);
-		return  1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD);
+		automatic int unsigned  lw = lo_width(i);
+		return	ACCU_WIDTH <= lw?
+			0 :
+			1 + ($clog2(SIMD) < ACCU_WIDTH-lw?
+					ACCU_WIDTH-lw :
+					$clog2(2**(ACCU_WIDTH-lw-1)+SIMD)
+				);
 	endfunction : hi_width
-	localparam int unsigned  LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0];
+	localparam int unsigned  LO_WIDTH_MAX = lo_width(3);
 	localparam int unsigned  HI_WIDTH_MAX = hi_width(2);
 
-	localparam int unsigned  A_WIDTH = 23 + 2*VERSION;	// Width of A datapath
-
 	// Compute the count of decendents for all nodes in the reduction trees.
-	typedef int unsigned  leave_load_t[2*SIMD-1];
-	function leave_load_t init_leave_loads();
-		automatic leave_load_t  res;
+	typedef int unsigned  leaf_load_t[2*SIMD-1];
+	function leaf_load_t init_leaf_loads();
+		automatic leaf_load_t  res;
 		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
 		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
 		return  res;
-	endfunction : init_leave_loads
+	endfunction : init_leaf_loads
 
 	// Pipeline for last indicator flag
-	logic [1:5] L = '0;
+	// Depth: 3 cycles for DSP + external SIMD reduction
+	localparam int unsigned  PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1);
+/* verilator lint_off LITENDIAN */
+	logic [1:PIPELINE_DEPTH] L = '0;
+/* verilator lint_on LITENDIAN */
 	always_ff @(posedge clk) begin
 		if(rst)      L <= '0;
-		else if(en)  L <= { last, L[1:4] };
-	end
-	assign	vld = L[5];
-	initial begin
-		if(mvu_pipeline_depth("mvu_4sx4u") < $bits(L)) begin
-			$error("%m: Outdated pipeline depth computation.");
-			$stop;
-		end
+		else if(en)  L <= { last, L[1:PIPELINE_DEPTH-1] };
 	end
+	assign	vld = L[PIPELINE_DEPTH];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
 	localparam int unsigned  PIPE_COUNT = (PE+3)/4;
@@ -146,15 +199,15 @@ module mvu_4sx4u #(
 		localparam int unsigned  PE_END = PE < 4*(c+1)? PE : 4*(c+1);
 		localparam int unsigned  PE_REM = 4*(c+1) - PE_END;
 
-		uwire        [47:0]  p3[SIMD];
-		uwire signed [ 1:0]  h3[SIMD][3];
+		uwire        [P_WIDTH-1:0]  p3[SIMD];
+		uwire signed [        1:0]  h3[SIMD][4];
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [17:0]  bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
-			logic [29:0]  aa;
-			logic [26:0]  dd;
-			logic [ 1:0]  xx[3:1];
+			uwire [B_WIDTH-1:0]  bb = { {(B_WIDTH-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
+			logic [A_WIDTH-1:0]  aa;
+			logic [A_WIDTH-1:0]  dd;
+			logic [1:0]  xx[3:1];
 			if(1) begin : blkVectorize
 				uwire signed [3:0]  ww[PE_END - PE_BEG];
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
@@ -182,23 +235,31 @@ module mvu_4sx4u #(
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 						automatic int unsigned  ofs = OFFSETS[pe + PE_REM];
-						dd[ofs+:3] = ww[pe];
-						assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin
-							$warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment.");
-						end
+						dd[ofs+:WEIGHT_WIDTH-1] = ww[pe][0+:WEIGHT_WIDTH-1];
 
 						// The sign of the weights are generally put on the subtracted A port.
 						// However, when coinciding with the actual sign bit position of the
 						// multiplier input path, it also goes onto the D input. This prevents
 						// sign extensions that may happen when a DSP primitive is auto-promoted
 						// to a newer generation.
-						if(ofs+3 == A_WIDTH-1)  dd[ofs+3] = ww[pe][3];
-						else                    aa[ofs+3] = ww[pe][3];
+						if(ofs+WEIGHT_WIDTH-1 == A_WIDTH-1)  dd[ofs+WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						else                                 aa[ofs+WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
+				if(NARROW_WEIGHTS) begin : genNarrowCheck	// Checker is only generated when NARROW_WEIGHTS is set
+					always_ff @(posedge clk iff en && !rst) begin	// Evaluated on enabled clock edges outside of reset
+						foreach(ww[pe]) begin
+							assert(zero || (ww[pe] !== -2**(WEIGHT_WIDTH-1))) else begin	// Unless zeroed, no weight may equal -2**(WEIGHT_WIDTH-1)
+								$warning("%m: Weight of %0x violates NARROW_WEIGHTS commitment.", ww[pe]);
+								$stop;	// Simulation is halted on a violation
+							end
+						end
+					end
+				end
+
 			end : blkVectorize
 
-			uwire [47:0]  pp;
+			uwire [P_WIDTH-1:0]  pp;
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
@@ -206,7 +267,7 @@ module mvu_4sx4u #(
 			if(BEHAVIORAL) begin : genBehav
 
 				// Stage #1: Input Refine
-				logic signed [17:0]  B1  = 0;
+				logic signed [B_WIDTH-1:0]  B1  = 0;
 				always_ff @(posedge clk) begin
 					if(zero)     B1  <= 0;
 					else if(en)  B1  <= bb;
@@ -219,7 +280,7 @@ module mvu_4sx4u #(
 				end
 
 				// Stage #2: Multiply
-				logic signed [45:0]  M2 = 0;
+				logic signed [A_WIDTH+B_WIDTH-1:0]  M2 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      M2 <= 0;
 					else if(en)  M2 <=
@@ -230,7 +291,7 @@ module mvu_4sx4u #(
 				end
 
 				// Stage #3: Accumulate
-				logic signed [47:0]  P3 = 0;
+				logic signed [P_WIDTH-1:0]  P3 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      P3 <= 0;
 					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
@@ -243,7 +304,7 @@ module mvu_4sx4u #(
 				localparam logic [6:0]  OPMODE_INVERSION = 7'b010_01_01;
 				uwire [6:0]  opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 };
 				case(VERSION)
-				1: DSP48E1 #(
+				DSP48E1: DSP48E1 #(
 					// Feature Control Attributes: Data Path Selection
 					.A_INPUT("DIRECT"),		// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
 					.B_INPUT("DIRECT"),		// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
@@ -307,7 +368,7 @@ module mvu_4sx4u #(
 					.OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input
 
 					// Data: 30-bit (each) input: Data Ports
-					.A(aa),			// 30-bit input: A data input
+					.A({5'b0, aa}),	// 30-bit input: A data input
 					.B(bb),			// 18-bit input: B data input
 					.C('x),			// 48-bit input: C data input
 					.CARRYIN('0),	// 1-bit input: Carry input signal
@@ -348,7 +409,7 @@ module mvu_4sx4u #(
 					.RSTM(rst),			// 1-bit input: Reset for MREG
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
-				2: DSP48E2 #(
+				DSP48E2: DSP48E2 #(
 					// Feature Control Attributes: Data Path Selection
 					.AMULTSEL("AD"),	// Selects A input to multiplier (A, AD)
 					.A_INPUT("DIRECT"),	// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
@@ -436,13 +497,147 @@ module mvu_4sx4u #(
 					.OPMODE({ 2'b00, opmode }),	// 9-bit input: Operation mode
 
 					// Data inputs: Data Ports
-					.A(aa),						// 34-bit input: A data
+					.A({3'b0, aa}),	// 30-bit input: A data
+					.B(bb),			// 18-bit input: B data
+					.C('x),			// 48-bit input: C data
+					.CARRYIN('0),	// 1-bit input: Carry-in
+					.D(dd),			// 27-bit input: D data
+
+					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
+					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
+					.CEAD(en),			// 1-bit input: Clock enable for ADREG
+					.CEALUMODE('0),		// 1-bit input: Clock enable for ALUMODE
+					.CEB1('0),			// 1-bit input: Clock enable for 1st stage BREG
+					.CEB2(en),			// 1-bit input: Clock enable for 2nd stage BREG
+					.CEC('0),			// 1-bit input: Clock enable for CREG
+					.CECARRYIN('0),		// 1-bit input: Clock enable for CARRYINREG
+					.CECTRL(en),		// 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+					.CED('0),			// 1-bit input: Clock enable for DREG
+					.CEINMODE('0),		// 1-bit input: Clock enable for INMODEREG
+					.CEM(en),			// 1-bit input: Clock enable for MREG
+					.CEP(en),			// 1-bit input: Clock enable for PREG
+					.RSTA('0),			// 1-bit input: Reset for AREG
+					.RSTB(				// 1-bit input: Reset for BREG
+// synthesis translate_off
+						rst ||
+// synthesis translate_on
+						zero
+					),
+					.RSTC('0),			// 1-bit input: Reset for CREG
+					.RSTD(				// 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+						zero ||
+// synthesis translate_on
+						rst
+					),
+					.RSTALLCARRYIN('0),	// 1-bit input: Reset for CARRYINREG
+					.RSTALUMODE('0),	// 1-bit input: Reset for ALUMODEREG
+					.RSTCTRL('0),		// 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+					.RSTINMODE('0),		// 1-bit input: Reset for INMODE register
+					.RSTM(rst),			// 1-bit input: Reset for MREG
+					.RSTP(rst)			// 1-bit input: Reset for PREG
+				);
+				DSP58: DSP58 #(
+					// Feature Control Attributes: Data Path Selection
+					.AMULTSEL("AD"),		// Selects A input to multiplier (A, AD)
+					.A_INPUT("DIRECT"),		// Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+					.BMULTSEL("B"),			// Selects B input to multiplier (AD, B)
+					.B_INPUT("DIRECT"),		// Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+					.DSP_MODE("INT24"),
+					.PREADDINSEL("A"),			// Selects input to pre-adder (A, B)
+					.RND('0),					// Rounding Constant
+					.USE_MULT("MULTIPLY"),		// Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+					.USE_SIMD("ONE58"),			// SIMD selection (FOUR12, ONE58, TWO24)
+					.USE_WIDEXOR("FALSE"),		// Use the Wide XOR function (FALSE, TRUE)
+					.XORSIMD("XOR24_34_58_116"),// Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+
+					// Pattern Detector Attributes: Pattern Detection Configuration
+					.AUTORESET_PATDET("NO_RESET"),		// NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+					.AUTORESET_PRIORITY("RESET"),		// Priority of AUTORESET vs. CEP (CEP, RESET).
+					.MASK('1),							// 58-bit mask value for pattern detect (1=ignore)
+					.PATTERN('0),						// 58-bit pattern match for pattern detect
+					.SEL_MASK("MASK"),					// C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+					.SEL_PATTERN("PATTERN"),			// Select pattern value (C, PATTERN)
+					.USE_PATTERN_DETECT("NO_PATDET"),	// Enable pattern detect (NO_PATDET, PATDET)
+
+					// Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+					.IS_ALUMODE_INVERTED('0),							// Optional inversion for ALUMODE
+					.IS_CARRYIN_INVERTED('0),							// Optional inversion for CARRYIN
+					.IS_CLK_INVERTED('0),								// Optional inversion for CLK
+					.IS_INMODE_INVERTED('0),							// Optional inversion for INMODE
+					.IS_NEGATE_INVERTED('0),							// Optional inversion for NEGATE
+					.IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}),	// Optional inversion for OPMODE
+					.IS_RSTALLCARRYIN_INVERTED('0),						// Optional inversion for RSTALLCARRYIN
+					.IS_RSTALUMODE_INVERTED('0),						// Optional inversion for RSTALUMODE
+					.IS_RSTA_INVERTED('0),								// Optional inversion for RSTA
+					.IS_RSTB_INVERTED('0),								// Optional inversion for RSTB
+					.IS_RSTCTRL_INVERTED('0),							// Optional inversion for RSTCTRL
+					.IS_RSTC_INVERTED('0),								// Optional inversion for RSTC
+					.IS_RSTD_INVERTED('0),								// Optional inversion for RSTD
+					.IS_RSTINMODE_INVERTED('0),							// Optional inversion for RSTINMODE
+					.IS_RSTM_INVERTED('0),								// Optional inversion for RSTM
+					.IS_RSTP_INVERTED('0),								// Optional inversion for RSTP
+
+					// Register Control Attributes: Pipeline Register Configuration
+					.ACASCREG(0),		// Number of pipeline stages between A/ACIN and ACOUT (0-2)
+					.ADREG(1),			// Pipeline stages for pre-adder (0-1)
+					.ALUMODEREG(0),		// Pipeline stages for ALUMODE (0-1)
+					.AREG(0),			// Pipeline stages for A (0-2)
+					.BCASCREG(1),		// Number of pipeline stages between B/BCIN and BCOUT (0-2)
+					.BREG(1),			// Pipeline stages for B (0-2)
+					.CARRYINREG(0),		// Pipeline stages for CARRYIN (0-1)
+					.CARRYINSELREG(0),	// Pipeline stages for CARRYINSEL (0-1)
+					.CREG(0),			// Pipeline stages for C (0-1)
+					.DREG(0),			// Pipeline stages for D (0-1)
+					.INMODEREG(0),		// Pipeline stages for INMODE (0-1)
+					.MREG(1),			// Multiplier pipeline stages (0-1)
+					.OPMODEREG(1),		// Pipeline stages for OPMODE (0-1)
+					.PREG(1),			// Number of pipeline stages for P (0-1)
+					.RESET_MODE("SYNC")	// Selection of synchronous or asynchronous reset. (ASYNC, SYNC)
+				) dsp (
+					// Cascade outputs: Cascade Ports
+					.ACOUT(),			// 34-bit output: A port cascade
+					.BCOUT(),			// 24-bit output: B cascade
+					.CARRYCASCOUT(),	// 1-bit output: Cascade carry
+					.MULTSIGNOUT(),		// 1-bit output: Multiplier sign cascade
+					.PCOUT(),			// 58-bit output: Cascade output
+
+					// Control outputs: Control Inputs/Status Bits
+					.OVERFLOW(),		// 1-bit output: Overflow in add/acc
+					.PATTERNBDETECT(),	// 1-bit output: Pattern bar detect
+					.PATTERNDETECT(),	// 1-bit output: Pattern detect
+					.UNDERFLOW(),		// 1-bit output: Underflow in add/acc
+
+					// Data outputs: Data Ports
+					.CARRYOUT(),		// 4-bit output: Carry
+					.P(pp),				// 58-bit output: Primary data
+					.XOROUT(),			// 8-bit output: XOR data
+
+					// Cascade inputs: Cascade Ports
+					.ACIN('x),			// 34-bit input: A cascade data
+					.BCIN('x),			// 24-bit input: B cascade
+					.CARRYCASCIN('x),	// 1-bit input: Cascade carry
+					.MULTSIGNIN('x),	// 1-bit input: Multiplier sign cascade
+					.PCIN('x),			// 58-bit input: P cascade
+
+					// Control inputs: Control Inputs/Status Bits
+					.CLK(clk),					// 1-bit input: Clock
+					.ALUMODE(4'h0),				// 4-bit input: ALU control
+					.CARRYINSEL('0),			// 3-bit input: Carry select
+					.INMODE(5'b01100),			// 5-bit input: INMODE control
+					.NEGATE('0),				// 3-bit input: Negates the input of the multiplier
+					.OPMODE({ 2'b00, opmode }),	// 9-bit input: Operation mode
+
+					// Data inputs: Data Ports
+					.A({7'b0, aa}),				// 34-bit input: A data
 					.B(bb),						// 24-bit input: B data
 					.C('x),						// 58-bit input: C data
 					.CARRYIN('0),				// 1-bit input: Carry-in
 					.D(dd),						// 27-bit input: D data
 
 					// Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+					.ASYNC_RST('0),		// 1-bit input: Asynchronous reset for all registers
 					.CEA1('0),			// 1-bit input: Clock enable for 1st stage AREG
 					.CEA2('0),			// 1-bit input: Clock enable for 2nd stage AREG
 					.CEAD(en),			// 1-bit input: Clock enable for ADREG
@@ -478,7 +673,7 @@ module mvu_4sx4u #(
 					.RSTP(rst)			// 1-bit input: Reset for PREG
 				);
 				default: initial begin
-					$error("Unknown version DSP48E%0d.", VERSION);
+					$error("Unknown DSP version.");
 					$finish;
 				end
 				endcase
@@ -508,33 +703,54 @@ module mvu_4sx4u #(
 			for(genvar  i = 0; i < 3; i++) begin
 				assign	h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1];
 			end
+			// Overflow out of high lane
+			logic  PZ = 0;
+			always_ff @(posedge clk) begin
+				if(rst)      PZ <= 0;
+				else if(en)  PZ <= L[3]? 0 : pp[$left(pp)];
+			end
+			assign	h3[s][3] =
+				( PZ && !pp[$left(pp)-:2])? +1 :
+				(!PZ && &pp[$left(pp)-:2])? -1 : 0;
+
 			assign	p3[s] = pp;
 
 		end : genSIMD
 
-		// Stage #4: Cross-SIMD Reduction
+		// Stage #4: Potentially Multiple Cycles of Cross-SIMD Reduction
+		// - binary reduction trees with SIMD leaf nodes for both the core lane outputs and the spill accumulation
+		// - balanced tree construction with all fully occupied levels pipelined
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
+		localparam leaf_load_t   LEAF_LOAD = SIMD > 1 ? init_leaf_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so every load simply defaults to 1; otherwise init_leaf_loads ends up in an infinite loop
+		localparam int unsigned  HI_NODE_REGISTERED = 2**($clog2(SIMD+1)-1)-2;
 
-		uwire signed [ACCU_WIDTH-1:0]  up4;
-		uwire signed [             HI_WIDTH_MAX-1:0]  hi4[3];
-		uwire        [$clog2(SIMD)+LO_WIDTH_MAX-1:0]  lo4[3];
-		for(genvar  i = 0; i < 4; i++) begin
+		uwire signed [HI_WIDTH_MAX-1:0]  hi4[4];
+		uwire        [LO_WIDTH_MAX-1:0]  lo4[4];
+		for(genvar  i = 0; i < 4; i++) begin : genLanes
 
 			// Conclusive high part accumulation
-			if(i < 3) begin : genHi
-				if(i < PE_REM)  assign  hi4[i] = '0;
+			if(i < PE_REM)  assign  hi4[i] = 0;
+			else begin : genHi
+				localparam int unsigned  HI_WIDTH = hi_width(i);
+				if(HI_WIDTH == 0)  assign  hi4[i] = 0;
 				else begin
-					localparam int unsigned  HI_WIDTH = hi_width(i);
-
 					// Adder Tree across all SIMD high contributions, each from [-1:1]
 					uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
 					for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
 					for(genvar  n = 0; n < SIMD-1; n++) begin
 						// Sum truncated to actual maximum bit width at this node
-						uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
-						assign  tree[n] = s;
+						typedef logic signed [$clog2(1+LEAF_LOAD[n]):0]  sum_t;
+						uwire sum_t  s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
+						if((0 < n) && (n <= HI_NODE_REGISTERED)) begin
+							sum_t  S = 0;
+							always_ff @(posedge clk) begin
+								if(rst)      S <= 0;
+								else if(en)  S <= s;
+							end
+							assign	tree[n] = S;
+						end
+						else  assign  tree[n] = s;
 					end
 
 					// High Sideband Accumulation
@@ -542,16 +758,15 @@ module mvu_4sx4u #(
 					always_ff @(posedge clk) begin
 						if(rst)      Hi4 <= 0;
 						else if(en) begin
-							automatic logic signed [HI_WIDTH:0]  h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]);
-							assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin
-								$error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH);
+							automatic logic signed [HI_WIDTH:0]  h = $signed(L[PIPELINE_DEPTH-1]? {(HI_WIDTH){1'b0}} : Hi4) + $signed(tree[0]);
+							assert(h[HI_WIDTH] === h[HI_WIDTH-1]) else begin
+								$error("%m [%0d:%0d]: Accumulation overflow for ACCU_WIDTH=%0d", c, i, ACCU_WIDTH);
 								$stop;
 							end
-							Hi4 <= h;
+							Hi4 <= h[HI_WIDTH-1:0];
 						end
 					end
 					assign	hi4[i] = Hi4;
-
 				end
 			end : genHi
 
@@ -561,34 +776,47 @@ module mvu_4sx4u #(
 				localparam int unsigned  LO_WIDTH = lo_width(i);
 
 				// Adder Tree across all SIMD low contributions
-				localparam int unsigned  ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+				localparam int unsigned  ROOT_WIDTH = sum_width(SIMD, LO_WIDTH);
 				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					localparam int unsigned  NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
-					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
 
-				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Lo4 <= 0;
-					else if(en)  Lo4 <= tree[0];
-				end
+				if(SIMD == 1) begin : genReg
+					// Just slide in a balancing register
+					logic [ROOT_WIDTH-1:0]  R = 'x;
+					always_ff @(posedge clk) begin
+						if(rst)      R <= 'x;
+						else if(en)  R <= p3[0][OFFSETS[i]+:LO_WIDTH];
+					end
+					assign	tree[0] = R;
+				end : genReg
+				else begin : genTree
+					for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH];
+					for(genvar  n = 0; n < SIMD-1; n++) begin
+						// Sum truncated to actual maximum bit width at this node
+						localparam int unsigned  NODE_WIDTH = sum_width(LEAF_LOAD[n], LO_WIDTH);
+						uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+						if(n <= HI_NODE_REGISTERED) begin
+							logic [NODE_WIDTH-1:0]  S = 'x;
+							always_ff @(posedge clk) begin
+								if(rst)      S <= 'x;
+								else if(en)  S <= s;
+							end
+							assign	tree[n] = S;
+						end
+						else  assign  tree[n] = s;
+					end
+				end : genTree
 
-				if(i == 3)  assign  up4 = Lo4;
-				else  assign  lo4[i] = Lo4;
+				assign  lo4[i] = tree[0];
 			end : genLo
 
-		end
+		end : genLanes
 
 		// Stage #5: Resolve lane totals
-		logic signed [3:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		logic signed [3:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 'x };
 		always_ff @(posedge clk) begin
-			if(rst)  Res5 <= '{ default: 0 };
+			if(rst)  Res5 <= '{ default: 'x };
 			else if(en) begin
-				Res5[3] <= up4 - hi4[2];
+				Res5[3] <= $signed({ hi4[3], {(lo_width(3)){1'b0}} }) + $signed({ 1'b0, lo4[3] }) - hi4[2];
 				Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
 				Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
 				Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] });
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
index fbe85b624f..96652893a2 100644
--- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -34,9 +34,9 @@
 module mvu_8sx8u_dsp48 #(
 	int unsigned  PE,
 	int unsigned  SIMD,
-	int unsigned  WEIGHT_WIDTH,
-	int unsigned  ACTIVATION_WIDTH,
 	int unsigned  ACCU_WIDTH,
+	int unsigned  ACTIVATION_WIDTH,
+	int unsigned  WEIGHT_WIDTH,
 
 	int unsigned  VERSION = 1,
 	bit  SIGNED_ACTIVATIONS = 0,
@@ -57,8 +57,6 @@ module mvu_8sx8u_dsp48 #(
 	output	logic  vld,
 	output	logic signed [PE-1:0][ACCU_WIDTH-1:0]  p
 );
-	import  mvu_pkg::*;
-
 	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
@@ -66,36 +64,115 @@ module mvu_8sx8u_dsp48 #(
 `endif
 		FORCE_BEHAVIORAL;
 
-	typedef int unsigned  leave_load_t[2*SIMD-1];
-	function leave_load_t init_leave_loads();
-		automatic leave_load_t  res;
-		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;
-		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];
+	//-----------------------------------------------------------------------
+	// Startup Recovery Watchdog
+	//  The DSP slice needs 100ns of recovery time after initial startup before
+	//  being able to ingest input properly. This watchdog discovers violating
+	//  stimuli during simulation and produces a corresponding warning.
+	if(1) begin : blkRecoveryWatch	// always-generated scope keeping the watchdog state local
+		logic  Dirty = 1;	// set from time zero until the 100ns recovery window has elapsed
+		initial begin
+			#100ns;
+			Dirty <= 0;	// recovery window over: inputs are accepted without complaint from here on
+		end
+
+		always_ff @(posedge clk) begin
+			assert(!Dirty || rst || !en || zero) else begin	// within the window, only rst, a disabled pipeline (!en) or zero passes silently
+				$warning("%m: Feeding input during DSP startup recovery. Expect functional errors.");
+			end
+		end
+	end : blkRecoveryWatch
+
+	//-----------------------------------------------------------------------
+	// Determine version-specific constraints
+	typedef enum { DSP48E1 = 1, DSP48E2 = 2 }  dsp_version_e;
+	localparam int unsigned  A_WIDTH = 25 + 2*(VERSION > 1);	// Width of A datapath
+	localparam int unsigned  B_WIDTH = 18;	// Width of B datapath
+	localparam int unsigned  P_WIDTH = 48;	// Width of P datapath
+
+	/**
+	 * Lane Slicing
+	 *	Assumptions:
+	 *	 - Internal lane widths differ, at most, by a single bit.
+	 *	 - The minimum lane width is WEIGHT_WIDTH + ACTIVATION_WIDTH - 1 bits
+	 *	   so as to confine cross-lane overflows to {-1,0,1}.
+	 *	 - The rightmost lane (#0) has the maximum internal width.
+	 *	 - The leftmost lane (#1) extends into the wide DSP accumulation path and
+	 *	   is typically constrained by ACCU_WIDTH rather than the next lane. If so,
+	 *	   it doesn't have an external high extension.
+	 *	 - The second-to-left lane (#0) has the minimum internal width and, hence,
+	 *	   the maximum external high extension.
+	 */
+	typedef int unsigned  lane_offset_v[2:0];
+	function lane_offset_v sliceLanes();
+		localparam int unsigned  MIN_LANE_WIDTH = WEIGHT_WIDTH + ACTIVATION_WIDTH - 1;
+		automatic lane_offset_v  res;
+
+		// Determine number of bits beyond accommodating minimum lane width
+		automatic int  bit_slack = A_WIDTH;
+		// protect sign, leftmost weight entry, minimum for rest of lanes
+		bit_slack -= 1 + WEIGHT_WIDTH + MIN_LANE_WIDTH;
+		if(bit_slack < 0) begin
+			localparam dsp_version_e  VER = dsp_version_e'(VERSION);
+			$error(
+				"%m: Cannot accommodate %0d-bit weights and %0d-bit activations on %s.",
+				WEIGHT_WIDTH, ACTIVATION_WIDTH, VER.name
+			);
+			$finish;
+		end
+
+		// Distribute slack bits preferring right lane
+		res[0] = 0;
+		res[1] = MIN_LANE_WIDTH + bit_slack;
+
+		// Last lane bounded by the smaller of ACCU_WIDTH or P datapath
+		res[2] = res[1] + ACCU_WIDTH;
+		if(res[2] > P_WIDTH)  res[2] = P_WIDTH;
+
 		return  res;
-	endfunction : init_leave_loads
+	endfunction : sliceLanes
+	localparam lane_offset_v  OFFSETS = sliceLanes();
 
 	function int unsigned sum_width(input int unsigned  n, input int unsigned  w);
-		return	w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n);
+		return  w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n);
 	endfunction : sum_width
+	function int unsigned lo_width(input int unsigned  i);	// bit width of lane i: distance between its own and the next offset in OFFSETS
+		return  OFFSETS[i+1] - OFFSETS[i];
+	endfunction : lo_width
+	function int unsigned hi_width(input int unsigned  i);	// bit width of the high extension accumulated outside the DSP for lane i
+		automatic int unsigned  lw = lo_width(i);
+		return	ACCU_WIDTH <= lw?
+			0 :	// lane itself already spans ACCU_WIDTH bits: no high extension (users tie hi4 to 0 for width 0)
+			1 + ($clog2(SIMD) < ACCU_WIDTH-lw?	// otherwise one extra bit on top of the larger of the two terms below, with w = ACCU_WIDTH-lw
+					ACCU_WIDTH-lw :	// w itself while $clog2(SIMD) < w
+					$clog2(2**(ACCU_WIDTH-lw-1)+SIMD)	// else enough bits to count up to 2^(w-1)+SIMD
+				);
+	endfunction : hi_width
+	localparam int unsigned  LO_WIDTH_MAX = lo_width(1);
+	localparam int unsigned  HI_WIDTH_MAX = hi_width(0);
+
+
+	typedef int unsigned  leaf_load_t[2*SIMD-1];	// one entry per node of a binary tree with SIMD leaves
+	function leaf_load_t init_leaf_loads();	// returns, for every tree node, the number of leaves reachable from it
+		automatic leaf_load_t  res;
+		for(int  i = 2*(SIMD-1); i >= int'(SIMD)-1; i--)  res[i] = 1;	// leaves sit at indices SIMD-1 .. 2*SIMD-2 and count themselves
+		for(int  i = SIMD-2; i >= 0; i--)  res[i] = res[2*i+1] + res[2*i+2];	// inner node i adds the loads of its children 2i+1 and 2i+2, walking towards root 0
+		return  res;
+	endfunction : init_leaf_loads
 
 	// Pipeline for last indicator flag
-	logic [1:5] L = '0;
+	// Depth: 3 cycles for DSP + external SIMD reduction
+	localparam int unsigned  PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1);
+/* verilator lint_off LITENDIAN */
+	logic [1:PIPELINE_DEPTH] L = '0;
+/* verilator lint_on LITENDIAN */
 	always_ff @(posedge clk) begin
 		if(rst)      L <= '0;
-		else if(en)  L <= { last, L[1:4] };
-	end
-	assign	vld = L[5];
-	initial begin
-		if(mvu_pipeline_depth("mvu_8sx8u_dsp48") < $bits(L)) begin
-			$error("%m: Outdated pipeline depth computation.");
-			$stop;
-		end
+		else if(en)  L <= { last, L[1:PIPELINE_DEPTH-1] };
 	end
+	assign	vld = L[PIPELINE_DEPTH];
 
 	// Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
-	localparam int unsigned  SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH;
-	localparam int unsigned  D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets
-
 	localparam int unsigned  PIPE_COUNT = (PE+1)/2;
 	for(genvar  c = 0; c < PIPE_COUNT; c++) begin : genPipes
 
@@ -103,17 +180,17 @@ module mvu_8sx8u_dsp48 #(
 		localparam int unsigned  PE_END = PE < 2*(c+1)? PE : 2*(c+1);
 		localparam int unsigned  PE_REM = 2*(c+1) - PE_END;
 
-		uwire        [47:0]  p3[SIMD];
-		uwire signed [ 1:0]  h3[SIMD];
+		uwire        [P_WIDTH-1:0]  p3[SIMD];
+		uwire signed [        1:0]  h3[SIMD][2];
 		for(genvar  s = 0; s < SIMD; s++) begin : genSIMD
 
 			// Input Lane Assembly
-			uwire [17:0]  bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
-			logic [29:0]  aa;
-			logic [26:0]  dd;
-			logic [ 1:0]  xx;
+			uwire [B_WIDTH-1:0]  bb = { {(B_WIDTH-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
+			logic [A_WIDTH-1:0]  aa;
+			logic [A_WIDTH-1:0]  dd;
+			logic [1:0]  xx;
 			if(1) begin : blkVectorize
-				uwire [WEIGHT_WIDTH-1:0]  ww[PE_END - PE_BEG];
+				uwire signed [WEIGHT_WIDTH-1:0]  ww[PE_END - PE_BEG];
 				for(genvar  pe = 0; pe < PE_END - PE_BEG; pe++) begin
 					assign	ww[pe] = w[PE_BEG + pe][s];
 					if(pe) begin
@@ -138,33 +215,33 @@ module mvu_8sx8u_dsp48 #(
 					dd = '0;
 					aa = '0;
 					for(int unsigned  pe = 0; pe < PE_END - PE_BEG; pe++) begin
-						dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe];
-						aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
+						dd[OFFSETS[pe + PE_REM]+:WEIGHT_WIDTH-1] = ww[pe][0+:WEIGHT_WIDTH-1];
+						aa[OFFSETS[pe + PE_REM]+ WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1];
 					end
 				end
 			end : blkVectorize
 
-			uwire [47:0]  pp;
+			uwire [P_WIDTH-1:0]  pp;
 
 			// Note: Since the product B * AD is computed,
 			//       rst can be only applied to AD and zero only to B
 			//       with the same effect as zeroing both.
 			if(BEHAVIORAL) begin : genBehav
 				// Stage #1: Input Refine
-				logic signed [17:0]  B1  = 0;
+				logic signed [B_WIDTH-1:0]  B1  = 0;
 				always_ff @(posedge clk) begin
 					if(zero)     B1  <= 0;
 					else if(en)  B1  <= bb;
 				end
 
-				logic signed [26:0]  AD1 = 0;
+				logic signed [A_WIDTH-1:0]  AD1 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      AD1 <= 0;
 					else if(en)  AD1 <= dd - aa;
 				end
 
 				// Stage #2: Multiply
-				logic signed [45:0]  M2 = 0;
+				logic signed [A_WIDTH+B_WIDTH-1:0]  M2 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      M2 <= 0;
 					else if(en)  M2 <=
@@ -175,7 +252,7 @@ module mvu_8sx8u_dsp48 #(
 				end
 
 				// Stage #3: Accumulate
-				logic signed [47:0]  P3 = 0;
+				logic signed [P_WIDTH-1:0]  P3 = 0;
 				always_ff @(posedge clk) begin
 					if(rst)      P3 <= 0;
 					else if(en)  P3 <= M2 + (L[3]? 0 : P3);
@@ -443,111 +520,127 @@ module mvu_8sx8u_dsp48 #(
 				else if(en) begin
 					X1 <= xx;
 					X2 <= X1;
-					X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]);
+					X3 <= X2 + (L[3]? 2'h0 : pp[OFFSETS[1]+:2]);
 				end
 			end
 
-			// Derive actual cross-lane overflows
-			assign  h3[s] = pp[D[1]+:2] - X3;
+			// Derive actual cross-lane overflow
+			assign  h3[s][0] = pp[OFFSETS[1]+:2] - X3;
+			// Overflow out of high lane
+			logic  PZ = 0;
+			always_ff @(posedge clk) begin
+				if(rst)      PZ <= 0;
+				else if(en)  PZ <= L[3]? 0 : pp[$left(pp)];
+			end
+			assign	h3[s][1] =
+				( PZ && !pp[$left(pp)-:2])? +1 :
+				(!PZ && &pp[$left(pp)-:2])? -1 : 0;
 
 			assign	p3[s] = pp;
 
 		end : genSIMD
 
-		// Stage #4: Cross-SIMD Reduction
+		// Stage #4: Potentially Multiple Cycles of Cross-SIMD Reduction
+		// - binary reduction trees with SIMD leaf nodes for both the core lane outputs and the spill accumulation
+		// - balanced tree construction with all fully occupied levels pipelined
 
 		// Count leaves reachable from each node
-		localparam leave_load_t  LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
-
-		// Range of Cross-lane Contribution Tracked in Hi4
-		/*
-		 * - Assumption: ACCU_WIDTH bounds right lane value at any point in time.
-		 * - The value x beyond the lane boundary is hence bounded by:
-		 *		-2^(w-1) <= x <= 2^(w-1)-1    with w = ACCU_WIDTH - D[1]
-		 * - This value decomposes into the tracked overflow h and the overflow l
-		 *   from the low SIMD lane reduction with:
-		 *		0 <= l <= SIMD
-		 * - From x = l + h follows:
-		 *		h = x - l
-		 *		-2^(w-1) - SIMD <= h <= 2^(w-1)-1
-		 * - This required bit width of the two's complement representation of this
-		 *   signed value is determined by its lower bound to be at least:
-		 *		1 + $clog2(2^(w-1)+SIMD)
-		 */
-		localparam int unsigned  HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD));
-
-		uwire signed [ACCU_WIDTH       -1:0]  up4;
-		uwire signed [HI_WIDTH         -1:0]  hi4;
-		uwire        [$clog2(SIMD)+D[1]-1:0]  lo4;
-
-		// Conclusive high part accumulation
-		if(PE_REM == 0) begin : genHi
-
-			// Adder Tree across all SIMD high contributions, each from [-1:1]
-			uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
-			for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s];
-			for(genvar  n = 0; n < SIMD-1; n++) begin
-				// Sum truncated to actual maximum bit width at this node
-				uwire signed [$clog2(1+LEAVE_LOAD[n]):0]  s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
-				assign  tree[n] = s;
-			end
+		localparam leaf_load_t   LEAF_LOAD = SIMD > 1 ? init_leaf_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so every load simply defaults to 1; otherwise init_leaf_loads ends up in an infinite loop
+		localparam int unsigned  HI_NODE_REGISTERED = 2**($clog2(SIMD+1)-1)-2;
+
+		uwire signed [HI_WIDTH_MAX-1:0]  hi4[2];
+		uwire        [LO_WIDTH_MAX-1:0]  lo4[2];
+		for(genvar  i = 0; i < 2; i++) begin : genLanes
+
+			// Conclusive high part accumulation
+			if(i < PE_REM)  assign  hi4[i] = 0;
+			else begin : genHi
+				localparam int unsigned  HI_WIDTH = hi_width(i);
+				if(HI_WIDTH == 0)  assign  hi4[i] = 0;
+				else begin
+					// Adder Tree across all SIMD high contributions, each from [-1:1]
+					uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0]  tree;
+					for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = h3[s][i];
+					for(genvar  n = 0; n < SIMD-1; n++) begin
+						// Sum truncated to actual maximum bit width at this node
+						typedef logic signed [$clog2(1+LEAF_LOAD[n]):0]  sum_t;
+						uwire sum_t  s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
+						if((0 < n) && (n <= HI_NODE_REGISTERED)) begin
+							sum_t  S = 0;
+							always_ff @(posedge clk) begin
+								if(rst)      S <= 0;
+								else if(en)  S <= s;
+							end
+							assign	tree[n] = S;
+						end
+						else  assign  tree[n] = s;
+					end
 
-			// High Sideband Accumulation
-			logic signed [HI_WIDTH-1:0]  Hi4 = 0;
-			always_ff @(posedge clk) begin
-				if(rst)  Hi4 <= 0;
-				else if(en) begin
-					automatic logic signed [HI_WIDTH:0]  h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]);
-					assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin
-						$error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH);
-						$stop;
+					// High Sideband Accumulation
+					logic signed [HI_WIDTH-1:0]  Hi4 = 0;
+					always_ff @(posedge clk) begin
+						if(rst)      Hi4 <= 0;
+						else if(en) begin
+							automatic logic signed [HI_WIDTH:0]  h = $signed(L[PIPELINE_DEPTH-1]? {(HI_WIDTH){1'b0}} : Hi4) + $signed(tree[0]);
+							assert(h[HI_WIDTH] === h[HI_WIDTH-1]) else begin
+								$error("%m [%0d:%0d]: Accumulation overflow for ACCU_WIDTH=%0d", c, i, ACCU_WIDTH);
+								$stop;
+							end
+							Hi4 <= h[HI_WIDTH-1:0];
+						end
 					end
-					Hi4 <= h;
+					assign	hi4[i] = Hi4;
 				end
-			end
-			assign	hi4 = Hi4;
-		end : genHi
-		else begin : genHiZero
-			assign hi4 = '0;
-		end : genHiZero
-
-		for(genvar  i = 0; i < 2; i++) begin
-			localparam int unsigned  LO_WIDTH = D[i+1] - D[i];
-			// Conclusive low part accumulation
-			if(i >= PE_REM) begin : blkLo
-				// Adder Tree across all SIMD low contributions (all unsigned arithmetic)
+			end : genHi
+
+			// Conclusive low part accumulation (all unsigned arithmetic)
+			if(i < PE_REM)  assign  lo4[i] = '0;
+			else begin : genLo
+				localparam int unsigned  LO_WIDTH = lo_width(i);
+
+				// Adder Tree across all SIMD low contributions
 				localparam int unsigned  ROOT_WIDTH = sum_width(SIMD, LO_WIDTH);
 				uwire [2*SIMD-2:0][ROOT_WIDTH-1:0]  tree;
-				for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
-				for(genvar  n = 0; n < SIMD-1; n++) begin
-					// Sum truncated to actual maximum bit width at this node
-					localparam int unsigned  NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH);
-					uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
-					assign  tree[n] = s;
-				end
 
-				logic [ROOT_WIDTH-1:0]  Lo4 = 0;
-				always_ff @(posedge clk) begin
-					if(rst)      Lo4 <= 0;
-					else if(en)  Lo4 <= tree[0];
-				end
+				if(SIMD == 1) begin : genReg
+					// Just slide in a balancing register
+					logic [ROOT_WIDTH-1:0]  R = 'x;
+					always_ff @(posedge clk) begin
+						if(rst)      R <= 'x;
+						else if(en)  R <= p3[0][OFFSETS[i]+:LO_WIDTH];
+					end
+					assign	tree[0] = R;
+				end : genReg
+				else begin : genTree
+					for(genvar  s = 0; s < SIMD;   s++)  assign  tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH];
+					for(genvar  n = 0; n < SIMD-1; n++) begin
+						// Sum truncated to actual maximum bit width at this node
+						localparam int unsigned  NODE_WIDTH = sum_width(LEAF_LOAD[n], LO_WIDTH);
+						uwire [NODE_WIDTH-1:0]  s = tree[2*n+1] + tree[2*n+2];
+						if(n <= HI_NODE_REGISTERED) begin
+							logic [NODE_WIDTH-1:0]  S = 'x;
+							always_ff @(posedge clk) begin
+								if(rst)      S <= 'x;
+								else if(en)  S <= s;
+							end
+							assign	tree[n] = S;
+						end
+						else  assign  tree[n] = s;
+					end
+				end : genTree
 
-				if(i == 1)  assign  up4 = Lo4;
-				else  assign  lo4 = Lo4;
-			end : blkLo
-			else begin : blkLoZero
-				assign lo4 = '0;
-			end : blkLoZero
+				assign  lo4[i] = tree[0];
+			end : genLo
 
-		end
+		end : genLanes
 
 		// Stage #5: Resolve lane totals
-		logic signed [1:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 0 };
+		logic signed [1:0][ACCU_WIDTH-1:0]  Res5 = '{ default: 'x };
 		always_ff @(posedge clk) begin
-			if(rst)  Res5 <= '{ default: 0 };
+			if(rst)  Res5 <= '{ default: 'x };
 			else if(en) begin
-				Res5[1] <= up4 - hi4;
-				Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 });
+				Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
+				Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] });
 			end
 		end
 
diff --git a/finn-rtllib/mvu/mvu_pkg.sv b/finn-rtllib/mvu/mvu_pkg.sv
deleted file mode 100644
index fa613b2aae..0000000000
--- a/finn-rtllib/mvu/mvu_pkg.sv
+++ /dev/null
@@ -1,22 +0,0 @@
-package mvu_pkg;
-	function int unsigned mvu_pipeline_depth(
-		input string        core,
-		input int unsigned  simd   = 0,
-		input int unsigned  seglen = 0
-	);
-		unique case(core)
-		"mvu_vvu_8sx9_dsp58": begin
-			automatic int  chainlen = (simd+2)/3;
-			if(seglen == 0)  seglen = chainlen;
-			return  3 + (chainlen-1)/seglen;
-		end
-		"mvu_4sx4u", "mvu_4sx4u_dsp48e1", "mvu_4sx4u_dsp48e2",
-		"mvu_8sx8u_dsp48":
-			return  5;
-		default: begin
-			$error("Unknown MVU core '%s'", core);
-			$finish;
-		end
-		endcase
-	endfunction : mvu_pipeline_depth
-endpackage : mvu_pkg
diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
index 7515b2e868..11bf4e9ccd 100644
--- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -61,8 +61,6 @@ module mvu_vvu_8sx9_dsp58 #(
 	output  logic vld,
     output  logic [PE-1:0][ACCU_WIDTH-1:0] p
   );
-	import  mvu_pkg::*;
-
 	// for verilator always use behavioral code
 	localparam bit  BEHAVIORAL =
 `ifdef VERILATOR
@@ -70,6 +68,25 @@ module mvu_vvu_8sx9_dsp58 #(
 `endif
 		FORCE_BEHAVIORAL;
 
+	//-----------------------------------------------------------------------
+	// Startup Recovery Watchdog
+	//  The DSP slice needs 100ns of recovery time after initial startup before
+	//  being able to ingest input properly. This watchdog detects violating
+	//  stimuli during simulation and issues a corresponding warning.
+	if(1) begin : blkRecoveryWatch
+		logic  Dirty = 1;	// set while still inside the startup recovery window
+		initial begin
+			#100ns;	// recovery window, counted from the start of simulation
+			Dirty <= 0;
+		end
+
+		always_ff @(posedge clk) begin
+			assert(!Dirty || rst || !en || zero) else begin	// fails only if Dirty while rst is low, en is high and zero is low
+				$warning("%m: Feeding input during DSP startup recovery. Expect functional errors.");
+			end
+		end
+	end : blkRecoveryWatch
+
 //-------------------- Declare global signals --------------------\\
 	localparam int unsigned CHAINLEN = (SIMD+2)/3;
 	localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
@@ -91,12 +108,6 @@ module mvu_vvu_8sx9_dsp58 #(
 		end
 	end
 	assign vld = L[0];
-	initial begin
-		if(mvu_pipeline_depth("mvu_vvu_8sx9_dsp58", SIMD, SEGMENTLEN) < $bits(L)) begin
-			$error("%m: Outdated pipeline depth computation.");
-			$stop;
-		end
-	end
 
 //-------------------- Shift register for ZERO flag --------------------\\
 	logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
index 368c338f37..f8511cd7a4 100644
--- a/finn-rtllib/mvu/mvu_vvu_axi.sv
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -91,7 +91,6 @@ module mvu_vvu_axi #(
 	output	logic  m_axis_output_tvalid,
 	input	logic  m_axis_output_tready
 );
-	import  mvu_pkg::*;
 
 //-------------------- Parameter sanity checks --------------------\\
 	initial begin
@@ -346,10 +345,12 @@ module mvu_vvu_axi #(
 	end : blkDsp
 
 	if(1) begin : blkOutput
-		localparam int unsigned  PIPELINE_DEPTH = mvu_pipeline_depth(COMPUTE_CORE, SIMD, SEGMENTLEN);
-		localparam int unsigned  MIN_OUT_PERIOD = MW / SIMD;
-		localparam int unsigned  MAX_IN_FLIGHT  = 1 + PIPELINE_DEPTH / MIN_OUT_PERIOD;
+		localparam int unsigned  CORE_PIPELINE_DEPTH =	// depth chosen by COMPUTE_CORE name; feeds MAX_IN_FLIGHT
+			COMPUTE_CORE == "dotp_8sx9_dsp58"? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) :	// NOTE(review): the removed mvu_pkg keyed this formula on "mvu_vvu_8sx9_dsp58" - confirm the core name
+			/* else */                         3 + $clog2(SIMD+1) + (SIMD == 1);
 
+		// This is conservative and could be divided by a guaranteed minimum output interval, e.g. MW/SIMD.
+		localparam int unsigned  MAX_IN_FLIGHT = CORE_PIPELINE_DEPTH;
 		typedef logic [PE-1:0][ACCU_WIDTH-1:0]  output_t;
 
 		logic signed [$clog2(MAX_IN_FLIGHT+1):0]  OPtr = '1;	// -1 | 0, 1, ..., MAX_IN_FLIGHT