From 3250913b9cbce020bb543b05f592656a877a5828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Thu, 23 Jan 2025 19:53:10 +0000 Subject: [PATCH 1/4] Avoid clock assignment for reliable simulation. --- finn-rtllib/mvu/mvu_vvu_axi.sv | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 0ee84b2f79..cdc770d5c3 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -129,8 +129,6 @@ module mvu_vvu_axi #( end end - uwire clk = ap_clk; - uwire clk2x = ap_clk2x; uwire rst = !ap_rst_n; //- Replay to Accommodate Neuron Fold ----------------------------------- @@ -144,7 +142,7 @@ module mvu_vvu_axi #( localparam int unsigned SF = MW/SIMD; localparam int unsigned NF = MH/PE; replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( - .clk, .rst, + .clk(ap_clk), .rst, .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) ); @@ -190,7 +188,6 @@ module mvu_vvu_axi #( typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; - uwire dsp_clk; uwire dsp_en; uwire dsp_last; @@ -202,8 +199,7 @@ module mvu_vvu_axi #( uwire dsp_p_t dsp_p; if(!PUMPED_COMPUTE) begin : genUnpumpedCompute - assign dsp_clk = clk; - assign dsp_en = en; + assign dsp_en = en; assign dsp_last = alast && avld; assign dsp_zero = !istb; @@ -214,15 +210,14 @@ module mvu_vvu_axi #( assign odat = dsp_p; end : genUnpumpedCompute else begin : genPumpedCompute - assign dsp_clk = clk2x; // Identify second fast cycle just before active slow clock edge logic Active = 0; if(1) begin : blkActive uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net - (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 
#(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(ap_clk)); (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); - always_ff @(posedge clk2x) Active <= clk_lut[1]; + always_ff @(posedge ap_clk2x) Active <= clk_lut[1]; end : blkActive // The input for a slow cycle is split across two fast cycles along the SIMD dimension. @@ -237,7 +232,7 @@ module mvu_vvu_axi #( for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; - always_ff @(posedge clk2x) begin + always_ff @(posedge ap_clk2x) begin if(rst) W[pe] <= 'x; else if(en) W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; end @@ -251,7 +246,7 @@ module mvu_vvu_axi #( for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; - always_ff @(posedge clk2x) begin + always_ff @(posedge ap_clk2x) begin if(rst) A[pe] <= 'x; else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; end @@ -260,7 +255,7 @@ module mvu_vvu_axi #( logic Zero = 1; logic Last = 0; - always_ff @(posedge clk2x) begin + always_ff @(posedge ap_clk2x) begin if(rst) begin Zero <= 1; Last <= 0; @@ -283,7 +278,7 @@ module mvu_vvu_axi #( // clock to pick it up. logic Vld = 0; dsp_p_t P = 'x; - always_ff @(posedge clk2x) begin + always_ff @(posedge ap_clk2x) begin if(rst) begin Vld <= 0; P <= 'x; @@ -307,7 +302,7 @@ module mvu_vvu_axi #( .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(dsp_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? 
ap_clk2x : ap_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); @@ -318,7 +313,7 @@ module mvu_vvu_axi #( .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), .VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(dsp_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); @@ -329,7 +324,7 @@ module mvu_vvu_axi #( .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), .VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(dsp_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); @@ -339,7 +334,7 @@ module mvu_vvu_axi #( .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(dsp_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); @@ -366,7 +361,7 @@ module mvu_vvu_axi #( assign en = A.rdy; uwire b_load = !B.vld || m_axis_output_tready; - always_ff @(posedge clk) begin + always_ff @(posedge ap_clk) begin if(rst) begin A <= '{ rdy: 1, default: 'x }; B <= '{ vld: 0, default: 'x }; From d6cd7562f9cbff0bdb2516ea9fac6f75e2df0634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 27 Jan 2025 19:39:05 +0000 Subject: [PATCH 2/4] Free running DSP cores without enables. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 8 ++ finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 8 ++ finn-rtllib/mvu/mvu_pkg.sv | 22 +++++ finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 11 ++- finn-rtllib/mvu/mvu_vvu_axi.sv | 122 ++++++++++++++------------ 5 files changed, 115 insertions(+), 56 deletions(-) create mode 100644 finn-rtllib/mvu/mvu_pkg.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 1f6e97281e..5541ee74f9 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -58,6 +58,8 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + import mvu_pkg::*; + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR @@ -129,6 +131,12 @@ module mvu_4sx4u #( else if(en) L <= { last, L[1:4] }; end assign vld = L[5]; + initial begin + if(mvu_pipeline_depth("mvu_4sx4u") < $bits(L)) begin + $error("%m: Outdated pipeline depth computation."); + $stop; + end + end // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism localparam int unsigned PIPE_COUNT = (PE+3)/4; diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index dabb36647e..fbe85b624f 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -57,6 +57,8 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); + import mvu_pkg::*; + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR @@ -83,6 +85,12 @@ module mvu_8sx8u_dsp48 #( else if(en) L <= { last, L[1:4] }; end assign vld = L[5]; + initial begin + if(mvu_pipeline_depth("mvu_8sx8u_dsp48") < $bits(L)) begin + $error("%m: Outdated pipeline depth computation."); + $stop; + end + end // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; diff --git a/finn-rtllib/mvu/mvu_pkg.sv 
b/finn-rtllib/mvu/mvu_pkg.sv new file mode 100644 index 0000000000..fa613b2aae --- /dev/null +++ b/finn-rtllib/mvu/mvu_pkg.sv @@ -0,0 +1,22 @@ +package mvu_pkg; + function int unsigned mvu_pipeline_depth( + input string core, + input int unsigned simd = 0, + input int unsigned seglen = 0 + ); + unique case(core) + "mvu_vvu_8sx9_dsp58": begin + automatic int chainlen = (simd+2)/3; + if(seglen == 0) seglen = chainlen; + return 3 + (chainlen-1)/seglen; + end + "mvu_4sx4u", "mvu_4sx4u_dsp48e1", "mvu_4sx4u_dsp48e2", + "mvu_8sx8u_dsp48": + return 5; + default: begin + $error("Unknown MVU core '%s'", core); + $finish; + end + endcase + endfunction : mvu_pipeline_depth +endpackage : mvu_pkg diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 2734f37cf3..7515b2e868 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -45,8 +45,7 @@ module mvu_vvu_8sx9_dsp58 #( localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 
1 : PE) * SIMD, localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD - ) - ( + ) ( // Global Control input logic clk, input logic rst, @@ -62,6 +61,8 @@ module mvu_vvu_8sx9_dsp58 #( output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); + import mvu_pkg::*; + // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR @@ -90,6 +91,12 @@ module mvu_vvu_8sx9_dsp58 #( end end assign vld = L[0]; + initial begin + if(mvu_pipeline_depth("mvu_vvu_8sx9_dsp58", SIMD, SEGMENTLEN) < $bits(L)) begin + $error("%m: Outdated pipeline depth computation."); + $stop; + end + end //-------------------- Shift register for ZERO flag --------------------\\ logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index cdc770d5c3..00511bb167 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -91,6 +91,7 @@ module mvu_vvu_axi #( output logic m_axis_output_tvalid, input logic m_axis_output_tready ); + import mvu_pkg::*; //-------------------- Parameter sanity checks --------------------\\ initial begin @@ -173,10 +174,9 @@ module mvu_vvu_axi #( end : genVVUInput //- Flow Control Bracket around Compute Core ---------------------------- - uwire en; - uwire istb = avld && s_axis_weights_tvalid; - assign ardy = en && s_axis_weights_tvalid; - assign s_axis_weights_tready = en && avld; + uwire idle; + assign ardy = !idle && s_axis_weights_tvalid; + assign s_axis_weights_tready = !idle && avld; //- Conditionally Pumped DSP Compute ------------------------------------ typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; @@ -188,8 +188,6 @@ module mvu_vvu_axi #( typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; - uwire dsp_en; - uwire dsp_last; uwire dsp_zero; uwire dsp_w_t dsp_w; @@ -199,10 
+197,8 @@ module mvu_vvu_axi #( uwire dsp_p_t dsp_p; if(!PUMPED_COMPUTE) begin : genUnpumpedCompute - assign dsp_en = en; - - assign dsp_last = alast && avld; - assign dsp_zero = !istb; + assign dsp_last = alast && avld && !idle; + assign dsp_zero = idle || !s_axis_weights_tvalid || !avld; assign dsp_w = mvu_w; assign dsp_a = amvau_i; @@ -261,12 +257,11 @@ module mvu_vvu_axi #( Last <= 0; end else if(en) begin - Zero <= !istb; - Last <= alast && avld && Active; + Zero <= idle || !s_axis_weights_tvalid || !avld; + Last <= alast && avld && !idle && Active; end end - assign dsp_en = en; assign dsp_last = Last; assign dsp_zero = Zero; assign dsp_w = W; @@ -294,7 +289,7 @@ module mvu_vvu_axi #( end : genPumpedCompute case(COMPUTE_CORE) - "mvu_vvu_8sx9_dsp58": + "mvu_vvu_8sx9_dsp58": begin : core mvu_vvu_8sx9_dsp58 #( .IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), @@ -302,42 +297,46 @@ module mvu_vvu_axi #( .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); - "mvu_4sx4u_dsp48e1": + end + "mvu_4sx4u_dsp48e1": begin : core mvu_4sx4u #( .PE(PE), .SIMD(DSP_SIMD), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), .VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? 
ap_clk2x : ap_clk), .rst, .en('1), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); - "mvu_4sx4u_dsp48e2": + end + "mvu_4sx4u_dsp48e2": begin : core mvu_4sx4u #( .PE(PE), .SIMD(DSP_SIMD), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS), .VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); - "mvu_8sx8u_dsp48": + end + "mvu_8sx8u_dsp48": begin : core mvu_8sx8u_dsp48 #( .PE(PE), .SIMD(DSP_SIMD), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) core ( - .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en(dsp_en), + .clk(PUMPED_COMPUTE? ap_clk2x : ap_clk), .rst, .en('1), .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); + end default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); $finish; @@ -346,41 +345,56 @@ module mvu_vvu_axi #( end : blkDsp -//-------------------- Output register slice --------------------\\ - // Make `en`computation independent from external inputs. - // Drive all outputs from registers. 
- struct packed { - logic rdy; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure - struct packed { - logic vld; - logic [PE-1:0][ACCU_WIDTH-1:0] dat; - } B = '{ vld: 0, default: 'x }; // ultimate output register - - assign en = A.rdy; - uwire b_load = !B.vld || m_axis_output_tready; - - always_ff @(posedge ap_clk) begin - if(rst) begin - A <= '{ rdy: 1, default: 'x }; - B <= '{ vld: 0, default: 'x }; + if(1) begin : blkOutput + localparam int unsigned PIPELINE_DEPTH = mvu_pipeline_depth(COMPUTE_CORE, SIMD, SEGMENTLEN); + localparam int unsigned MIN_OUT_PERIOD = MW / SIMD; + localparam int unsigned MAX_IN_FLIGHT = 1 + PIPELINE_DEPTH / MIN_OUT_PERIOD; + + typedef logic [PE-1:0][ACCU_WIDTH-1:0] output_t; + + logic signed [$clog2(MAX_IN_FLIGHT+1):0] OPtr = '1; // -1 | 0, 1, ..., MAX_IN_FLIGHT + output_t OBuf[0:MAX_IN_FLIGHT]; + logic OVld = 0; + output_t OReg = 'x; + logic OLock = 0; // Lock upon backpressure (second entry into queue) + + // Catch every output into (SRL) Output Queue + always_ff @(posedge ap_clk) begin + if(ovld) OBuf <= { odat, OBuf[0:MAX_IN_FLIGHT-1] }; end - else begin - if(A.rdy) A.dat <= odat; - A.rdy <= (A.rdy && !ovld) || b_load; - - if(b_load) begin - B <= '{ - vld: ovld || !A.rdy, - dat: A.rdy? odat : A.dat - }; + + always_ff @(posedge ap_clk) begin + if(rst) begin + OPtr <= '1; + OVld <= 0; + OReg <= 'x; + OLock <= 0; + end + else begin + automatic logic push = ovld; + automatic logic pop = (m_axis_output_tready || !OVld) && !OPtr[$left(OPtr)]; + assert(pop || !push || (OPtr < $signed(MAX_IN_FLIGHT))) else begin + $error("%m: Overflowing output queue."); + $stop; + end + OPtr <= OPtr + $signed(push == pop? 0 : push? 
1 : -1); + + if(OPtr[$left(OPtr)]) OLock <= 0; + else if(OVld && !m_axis_output_tready) OLock <= 1; + + if(m_axis_output_tready || !OVld) begin + OVld <= !OPtr[$left(OPtr)]; + OReg <= OBuf[OPtr[$left(OPtr)-1:0]]; + end end end - end - assign m_axis_output_tvalid = B.vld; - // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? - // These extra bits should never be used. Why not 'x them out? - assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + assign idle = OLock; + + assign m_axis_output_tvalid = OVld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){OReg[PE-1][ACCU_WIDTH-1]}}, OReg }; + + end : blkOutput endmodule : mvu_vvu_axi From ae0574cf57ef7d1da564143ce1f438784f2c24a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Mon, 27 Jan 2025 20:20:25 +0000 Subject: [PATCH 3/4] Remove obsolete references to `en` signal. --- finn-rtllib/mvu/mvu_vvu_axi.sv | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 00511bb167..368c338f37 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -229,8 +229,8 @@ module mvu_vvu_axi #( for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; always_ff @(posedge ap_clk2x) begin - if(rst) W[pe] <= 'x; - else if(en) W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + if(rst) W[pe] <= 'x; + else W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; end end : genPERegW @@ -243,8 +243,8 @@ module mvu_vvu_axi #( for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; always_ff @(posedge ap_clk2x) begin - if(rst) A[pe] <= 'x; - else if(en) A[pe] <= a[(Active? 
DSP_SIMD : 0) +: DSP_SIMD]; + if(rst) A[pe] <= 'x; + else A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; end end : genPERegA @@ -256,7 +256,7 @@ module mvu_vvu_axi #( Zero <= 1; Last <= 0; end - else if(en) begin + else begin Zero <= idle || !s_axis_weights_tvalid || !avld; Last <= alast && avld && !idle && Active; end @@ -278,7 +278,7 @@ module mvu_vvu_axi #( Vld <= 0; P <= 'x; end - else if(en) begin + else begin if(dsp_vld) P <= dsp_p; Vld <= dsp_vld || (Vld && !Active); end @@ -353,6 +353,7 @@ module mvu_vvu_axi #( typedef logic [PE-1:0][ACCU_WIDTH-1:0] output_t; logic signed [$clog2(MAX_IN_FLIGHT+1):0] OPtr = '1; // -1 | 0, 1, ..., MAX_IN_FLIGHT + (* SHREG_EXTRACT = "YES" *) output_t OBuf[0:MAX_IN_FLIGHT]; logic OVld = 0; output_t OReg = 'x; From 92bcbb3a34a0fe9ac9f5dedb0ee174cc851cb72e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Fri, 31 Jan 2025 12:57:34 +0000 Subject: [PATCH 4/4] Abolish package for synthesis compatibility and homogenize pipelining. 
--- finn-rtllib/mvu/mvu_4sx4u.sv | 412 ++++++++++++++++++++------ finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 333 +++++++++++++-------- finn-rtllib/mvu/mvu_pkg.sv | 22 -- finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 27 +- finn-rtllib/mvu/mvu_vvu_axi.sv | 9 +- 5 files changed, 557 insertions(+), 246 deletions(-) delete mode 100644 finn-rtllib/mvu/mvu_pkg.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index 5541ee74f9..5dadefbaa2 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -39,6 +39,7 @@ module mvu_4sx4u #( int unsigned ACCU_WIDTH, int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS + // Allowed versions - 1: DSP48E1, 2: DSP48E2, 3: DSP58 bit SIGNED_ACTIVATIONS = 0, bit NARROW_WEIGHTS = 0, // Weights from [-7:7] rather than [-8:7] bit FORCE_BEHAVIORAL = 0 @@ -58,9 +59,8 @@ module mvu_4sx4u #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - import mvu_pkg::*; - // for verilator always use behavioral code + // For Verilator: always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR 1 || @@ -68,10 +68,39 @@ module mvu_4sx4u #( FORCE_BEHAVIORAL; //----------------------------------------------------------------------- - // Determine Lane Configuration + // Startup Recovery Watchdog + // The DSP slice needs 100ns of recovery time after initial startup before + // being able to ingest input properly. This watchdog discovers violating + // stimuli during simulation and produces a corresponding warning. + if(1) begin : blkRecoveryWatch + logic Dirty = 1; + initial begin + #100ns; + Dirty <= 0; + end + + always_ff @(posedge clk) begin + assert(!Dirty || rst || !en || zero) else begin + $warning("%m: Feeding input during DSP startup recovery. 
Expect functional errors."); + $stop; + end + end + end : blkRecoveryWatch + + //----------------------------------------------------------------------- + // Determine version-specific constraints + typedef enum { DSP48E1 = 1, DSP48E2 = 2, DSP58 = 3 } dsp_version_e; + localparam int unsigned A_WIDTH = 25 + 2*(VERSION > 1); // Width of A datapath + localparam int unsigned B_WIDTH = 18 + 6*(VERSION > 2); // Width of B datapath + localparam int unsigned P_WIDTH = VERSION == DSP58? 58 : 48; // Width of P datapath + initial begin - if(!NARROW_WEIGHTS && (VERSION == 1)) begin - $error("%m: Need NARROW_WEIGHTS for DSP48E1."); + if(WEIGHT_WIDTH > 4) begin + $error("%m: Requested WEIGHT_WIDTH=%0d beyond support for 4 bits.", WEIGHT_WIDTH); + $finish; + end + if(ACTIVATION_WIDTH > 4) begin + $error("%m: Requested ACTIVATION_WIDTH=%0d beyond support for 4 bits.", ACTIVATION_WIDTH); $finish; end end @@ -80,63 +109,87 @@ module mvu_4sx4u #( * Lane Slicing * Assumptions: * - Internal lane widths differ, at most, by a single bit. + * - The minimum lane width is WEIGHT_WIDTH + ACTIVATION_WIDTH - 1 bits + * so as to confine cross-lane overflows to {-1,0,1}. * - The rightmost lane (#0) has the maximum internal width. * - The leftmost lane (#3) extends into the wide DSP accumulation path and - * is constrained by ACCU_WIDTH rather than the next lane. It doesn't have - * an external high extension. + * is typically constrained by ACCU_WIDTH rather than the next lane. If so, + * it doesn't have an external high extension. * - The one but leftmost lane (#2) has the minimum internal width and, hence, - * the macimum external high extension. + * the maximum external high extension. */ typedef int unsigned lane_offset_v[4:0]; function lane_offset_v sliceLanes(); - unique case(VERSION) - 1: begin - return NARROW_WEIGHTS? 
- lane_offset_v'{ ACCU_WIDTH+21, 21, 14, 7, 0 } : - lane_offset_v'{ 0, 0, 0, 0, 0 }; // not supported + localparam int unsigned MIN_LANE_WIDTH = WEIGHT_WIDTH + ACTIVATION_WIDTH - 1; + automatic lane_offset_v res; + + // Determine number of bits beyond accommodating minimum lane width + automatic int bit_slack = A_WIDTH; + // protect sign if not narrow, leftmost weight entry, minimum for rest of lanes + bit_slack -= !NARROW_WEIGHTS + WEIGHT_WIDTH + 3*MIN_LANE_WIDTH; + if(bit_slack < 0) begin + localparam dsp_version_e VER = dsp_version_e'(VERSION); + $error( + "%m: Cannot accommodate %0d-bit %snarrow weights on %s.", + WEIGHT_WIDTH, NARROW_WEIGHTS? "" : "non-", VER.name + ); + $finish; end - 2: begin - return NARROW_WEIGHTS? - lane_offset_v'{ ACCU_WIDTH+23, 23, 16, 8, 0 } : - lane_offset_v'{ ACCU_WIDTH+22, 22, 15, 8, 0 }; + + // Distribute slack bits preferring right lanes + res[0] = 0; + for(int unsigned i = 1; i < 4; i++) begin + automatic int unsigned extra = (bit_slack + (3-i)) / (4-i); + res[i] = res[i-1] + MIN_LANE_WIDTH + extra; + bit_slack -= extra; end - endcase + + // Last lane bounded by the smaller of ACCU_WIDTH or P datapath + res[4] = res[3] + ACCU_WIDTH; + if(res[4] > P_WIDTH) res[4] = P_WIDTH; + + return res; endfunction : sliceLanes localparam lane_offset_v OFFSETS = sliceLanes(); + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width function int unsigned lo_width(input int unsigned i); return OFFSETS[i+1] - OFFSETS[i]; endfunction : lo_width function int unsigned hi_width(input int unsigned i); - return 1 + $clog2(2**(ACCU_WIDTH-lo_width(i)-1)+SIMD); + automatic int unsigned lw = lo_width(i); + return ACCU_WIDTH <= lw? + 0 : + 1 + ($clog2(SIMD) < ACCU_WIDTH-lw? 
+ ACCU_WIDTH-lw : + $clog2(2**(ACCU_WIDTH-lw-1)+SIMD) + ); endfunction : hi_width - localparam int unsigned LO_WIDTH_MAX = OFFSETS[1] - OFFSETS[0]; + localparam int unsigned LO_WIDTH_MAX = lo_width(3); localparam int unsigned HI_WIDTH_MAX = hi_width(2); - localparam int unsigned A_WIDTH = 23 + 2*VERSION; // Width of A datapath - // Compute the count of decendents for all nodes in the reduction trees. - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; + typedef int unsigned leaf_load_t[2*SIMD-1]; + function leaf_load_t init_leaf_loads(); + automatic leaf_load_t res; for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; return res; - endfunction : init_leave_loads + endfunction : init_leaf_loads // Pipeline for last indicator flag - logic [1:5] L = '0; + // Depth: 3 cycles for DSP + external SIMD reduction + localparam int unsigned PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1); +/* verilator lint_off LITENDIAN */ + logic [1:PIPELINE_DEPTH] L = '0; +/* verilator lint_on LITENDIAN */ always_ff @(posedge clk) begin if(rst) L <= '0; - else if(en) L <= { last, L[1:4] }; - end - assign vld = L[5]; - initial begin - if(mvu_pipeline_depth("mvu_4sx4u") < $bits(L)) begin - $error("%m: Outdated pipeline depth computation."); - $stop; - end + else if(en) L <= { last, L[1:PIPELINE_DEPTH-1] }; end + assign vld = L[PIPELINE_DEPTH]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism localparam int unsigned PIPE_COUNT = (PE+3)/4; @@ -146,15 +199,15 @@ module mvu_4sx4u #( localparam int unsigned PE_END = PE < 4*(c+1)? 
PE : 4*(c+1); localparam int unsigned PE_REM = 4*(c+1) - PE_END; - uwire [47:0] p3[SIMD]; - uwire signed [ 1:0] h3[SIMD][3]; + uwire [P_WIDTH-1:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][4]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; - logic [29:0] aa; - logic [26:0] dd; - logic [ 1:0] xx[3:1]; + uwire [B_WIDTH-1:0] bb = { {(B_WIDTH-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [A_WIDTH-1:0] aa; + logic [A_WIDTH-1:0] dd; + logic [1:0] xx[3:1]; if(1) begin : blkVectorize uwire signed [3:0] ww[PE_END - PE_BEG]; for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin @@ -182,23 +235,31 @@ module mvu_4sx4u #( aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin automatic int unsigned ofs = OFFSETS[pe + PE_REM]; - dd[ofs+:3] = ww[pe]; - assert(!NARROW_WEIGHTS || rst || !en || zero || (ww[pe] != -8)) else begin - $warning("%m: Weight of -8 violates NARROW_WEIGHTS commitment."); - end + dd[ofs+:WEIGHT_WIDTH-1] = ww[pe][0+:WEIGHT_WIDTH-1]; // The sign of the weights are generally put on the subtracted A port. // However, when coinciding with the actual sign bit position of the // multiplier input path, it also goes onto the D input. This prevents // sign extensions that may happen when a DSP primitive is auto-promoted // to a newer generation. 
- if(ofs+3 == A_WIDTH-1) dd[ofs+3] = ww[pe][3]; - else aa[ofs+3] = ww[pe][3]; + if(ofs+WEIGHT_WIDTH-1 == A_WIDTH-1) dd[ofs+WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + else aa[ofs+WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end + if(NARROW_WEIGHTS) begin : genNarrowCheck + always_ff @(posedge clk iff en && !rst) begin + foreach(ww[pe]) begin + assert(zero || (ww[pe] !== -2**(WEIGHT_WIDTH-1))) else begin + $warning("%m: Weight of %0x violates NARROW_WEIGHTS commitment.", ww[pe]); + $stop; + end + end + end + end + end : blkVectorize - uwire [47:0] pp; + uwire [P_WIDTH-1:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B @@ -206,7 +267,7 @@ module mvu_4sx4u #( if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [17:0] B1 = 0; + logic signed [B_WIDTH-1:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; @@ -219,7 +280,7 @@ module mvu_4sx4u #( end // Stage #2: Multiply - logic signed [45:0] M2 = 0; + logic signed [A_WIDTH+B_WIDTH-1:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -230,7 +291,7 @@ module mvu_4sx4u #( end // Stage #3: Accumulate - logic signed [47:0] P3 = 0; + logic signed [P_WIDTH-1:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 
0 : P3); @@ -243,7 +304,7 @@ module mvu_4sx4u #( localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; case(VERSION) - 1: DSP48E1 #( + DSP48E1: DSP48E1 #( // Feature Control Attributes: Data Path Selection .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) @@ -307,7 +368,7 @@ module mvu_4sx4u #( .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input // Data: 30-bit (each) input: Data Ports - .A(aa), // 30-bit input: A data input + .A({5'b0, aa}), // 30-bit input: A data input .B(bb), // 18-bit input: B data input .C('x), // 48-bit input: C data input .CARRYIN('0), // 1-bit input: Carry input signal @@ -348,7 +409,7 @@ module mvu_4sx4u #( .RSTM(rst), // 1-bit input: Reset for MREG .RSTP(rst) // 1-bit input: Reset for PREG ); - 2: DSP48E2 #( + DSP48E2: DSP48E2 #( // Feature Control Attributes: Data Path Selection .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) @@ -436,13 +497,147 @@ module mvu_4sx4u #( .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode // Data inputs: Data Ports - .A(aa), // 34-bit input: A data + .A({3'b0, aa}), // 30-bit input: A data + .B(bb), // 18-bit input: B data + .C('x), // 48-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + 
.CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + DSP58: DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT24"), + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"),// Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of 
AUTORESET vs. CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_NEGATE_INVERTED('0), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), 
// Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A({7'b0, aa}), // 34-bit input: A data .B(bb), // 24-bit input: B data .C('x), // 58-bit input: C data .CARRYIN('0), // 1-bit input: Carry-in .D(dd), // 27-bit input: D data // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: 
Asynchronous reset for all registers .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG .CEAD(en), // 1-bit input: Clock enable for ADREG @@ -478,7 +673,7 @@ module mvu_4sx4u #( .RSTP(rst) // 1-bit input: Reset for PREG ); default: initial begin - $error("Unknown version DSP48E%0d.", VERSION); + $error("Unknown DSP version."); $finish; end endcase @@ -508,33 +703,54 @@ module mvu_4sx4u #( for(genvar i = 0; i < 3; i++) begin assign h3[s][i] = pp[OFFSETS[i+1]+:2] - X3[i+1]; end + // Overflow out of high lane + logic PZ = 0; + always_ff @(posedge clk) begin + if(rst) PZ <= 0; + else if(en) PZ <= L[3]? 0 : pp[$left(pp)]; + end + assign h3[s][3] = + ( PZ && !pp[$left(pp)-:2])? +1 : + (!PZ && &pp[$left(pp)-:2])? -1 : 0; + assign p3[s] = pp; end : genSIMD - // Stage #4: Cross-SIMD Reduction + // Stage #4: Potentially Multiple Cycles of Cross-SIMD Reduction + // - binary reduction trees with SIMD leaf nodes for both the core lane outputs and the spill accumulation + // - balanced tree construction with all fully occupied levels pipelined // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam leaf_load_t LEAF_LOAD = SIMD > 1 ? 
init_leaf_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leaf_loads ends up in infinite loop + localparam int unsigned HI_NODE_REGISTERED = 2**($clog2(SIMD+1)-1)-2; - uwire signed [ACCU_WIDTH-1:0] up4; - uwire signed [ HI_WIDTH_MAX-1:0] hi4[3]; - uwire [$clog2(SIMD)+LO_WIDTH_MAX-1:0] lo4[3]; - for(genvar i = 0; i < 4; i++) begin + uwire signed [HI_WIDTH_MAX-1:0] hi4[4]; + uwire [LO_WIDTH_MAX-1:0] lo4[4]; + for(genvar i = 0; i < 4; i++) begin : genLanes // Conclusive high part accumulation - if(i < 3) begin : genHi - if(i < PE_REM) assign hi4[i] = '0; + if(i < PE_REM) assign hi4[i] = 0; + else begin : genHi + localparam int unsigned HI_WIDTH = hi_width(i); + if(HI_WIDTH == 0) assign hi4[i] = 0; else begin - localparam int unsigned HI_WIDTH = hi_width(i); - // Adder Tree across all SIMD high contributions, each from [-1:1] uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); - assign tree[n] = s; + typedef logic signed [$clog2(1+LEAF_LOAD[n]):0] sum_t; + uwire sum_t s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + if((0 < n) && (n <= HI_NODE_REGISTERED)) begin + sum_t S = 0; + always_ff @(posedge clk) begin + if(rst) S <= 0; + else if(en) S <= s; + end + assign tree[n] = S; + end + else assign tree[n] = s; end // High Sideband Accumulation @@ -542,16 +758,15 @@ module mvu_4sx4u #( always_ff @(posedge clk) begin if(rst) Hi4 <= 0; else if(en) begin - automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); - assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin - $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); + automatic logic signed [HI_WIDTH:0] h = $signed(L[PIPELINE_DEPTH-1]? 
{(HI_WIDTH){1'b0}} : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] === h[HI_WIDTH-1]) else begin + $error("%m [%0d:%0d]: Accumulation overflow for ACCU_WIDTH=%0d", c, i, ACCU_WIDTH); $stop; end - Hi4 <= h; + Hi4 <= h[HI_WIDTH-1:0]; end end assign hi4[i] = Hi4; - end end : genHi @@ -561,34 +776,47 @@ module mvu_4sx4u #( localparam int unsigned LO_WIDTH = lo_width(i); // Adder Tree across all SIMD low contributions - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); - uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - logic [ROOT_WIDTH-1:0] Lo4 = 0; - always_ff @(posedge clk) begin - if(rst) Lo4 <= 0; - else if(en) Lo4 <= tree[0]; - end + if(SIMD == 1) begin : genReg + // Just slide in a balancing register + logic [ROOT_WIDTH-1:0] R = 'x; + always_ff @(posedge clk) begin + if(rst) R <= 'x; + else if(en) R <= p3[0][OFFSETS[i]+:LO_WIDTH]; + end + assign tree[0] = R; + end : genReg + else begin : genTree + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = sum_width(LEAF_LOAD[n], LO_WIDTH); + uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; + if(n <= HI_NODE_REGISTERED) begin + logic [NODE_WIDTH-1:0] S = 'x; + always_ff @(posedge clk) begin + if(rst) S <= 'x; + else if(en) S <= s; + end + assign tree[n] = S; + end + else assign tree[n] = s; + end + end : genTree - if(i == 3) assign up4 = Lo4; - else assign lo4[i] = Lo4; + assign lo4[i] = tree[0]; end : genLo - end + 
end : genLanes // Stage #5: Resolve lane totals - logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 'x }; always_ff @(posedge clk) begin - if(rst) Res5 <= '{ default: 0 }; + if(rst) Res5 <= '{ default: 'x }; else if(en) begin - Res5[3] <= up4 - hi4[2]; + Res5[3] <= $signed({ hi4[3], {(lo_width(3)){1'b0}} }) + $signed({ 1'b0, lo4[3] }) - hi4[2]; Res5[2] <= $signed({ hi4[2], {(lo_width(2)){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] }); diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index fbe85b624f..96652893a2 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -34,9 +34,9 @@ module mvu_8sx8u_dsp48 #( int unsigned PE, int unsigned SIMD, - int unsigned WEIGHT_WIDTH, - int unsigned ACTIVATION_WIDTH, int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, int unsigned VERSION = 1, bit SIGNED_ACTIVATIONS = 0, @@ -57,8 +57,6 @@ module mvu_8sx8u_dsp48 #( output logic vld, output logic signed [PE-1:0][ACCU_WIDTH-1:0] p ); - import mvu_pkg::*; - // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR @@ -66,36 +64,115 @@ module mvu_8sx8u_dsp48 #( `endif FORCE_BEHAVIORAL; - typedef int unsigned leave_load_t[2*SIMD-1]; - function leave_load_t init_leave_loads(); - automatic leave_load_t res; - for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; - for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + //----------------------------------------------------------------------- + // Startup Recovery Watchdog + // The DSP slice needs 100ns of recovery time after initial startup before + // being able to ingest input properly. 
This watchdog discovers violating + // stimuli during simulation and produces a corresponding warning. + if(1) begin : blkRecoveryWatch + logic Dirty = 1; + initial begin + #100ns; + Dirty <= 0; + end + + always_ff @(posedge clk) begin + assert(!Dirty || rst || !en || zero) else begin + $warning("%m: Feeding input during DSP startup recovery. Expect functional errors."); + end + end + end : blkRecoveryWatch + + //----------------------------------------------------------------------- + // Determine version-specific constraints + typedef enum { DSP48E1 = 1, DSP48E2 = 2 } dsp_version_e; + localparam int unsigned A_WIDTH = 25 + 2*(VERSION > 1); // Width of A datapath + localparam int unsigned B_WIDTH = 18; // Width of B datapath + localparam int unsigned P_WIDTH = 48; // Width of P datapath + + /** + * Lane Slicing + * Assumptions: + * - Internal lane widths differ, at most, by a single bit. + * - The minimum lane width is WEIGHT_WIDTH + ACTIVATION_WIDTH - 1 bits + * so as to confine cross-lane overflows to {-1,0,1}. + * - The rightmost lane (#0) has the maximum internal width. + * - The leftmost lane (#3) extends into the wide DSP accumulation path and + * is typically constrained by ACCU_WIDTH rather than the next lane. If so, + * it doesn't have an external high extension. + * - The one but leftmost lane (#2) has the minimum internal width and, hence, + * the maximum external high extension. 
+ */ + typedef int unsigned lane_offset_v[2:0]; + function lane_offset_v sliceLanes(); + localparam int unsigned MIN_LANE_WIDTH = WEIGHT_WIDTH + ACTIVATION_WIDTH - 1; + automatic lane_offset_v res; + + // Determine number of bits beyond accommodating minimum lane width + automatic int bit_slack = A_WIDTH; + // protect sign, leftmost weight entry, minimum for rest of lanes + bit_slack -= 1 + WEIGHT_WIDTH + MIN_LANE_WIDTH; + if(bit_slack < 0) begin + localparam dsp_version_e VER = dsp_version_e'(VERSION); + $error( + "%m: Cannot accommodate %0d-bit weights and %0d-bit activations on %s.", + WEIGHT_WIDTH, ACTIVATION_WIDTH, VER.name + ); + $finish; + end + + // Distribute slack bits preferring right lane + res[0] = 0; + res[1] = MIN_LANE_WIDTH + bit_slack; + + // Last lane bounded by the smaller of ACCU_WIDTH or P datapath + res[2] = res[1] + ACCU_WIDTH; + if(res[2] > P_WIDTH) res[2] = P_WIDTH; + return res; - endfunction : init_leave_loads + endfunction : sliceLanes + localparam lane_offset_v OFFSETS = sliceLanes(); function int unsigned sum_width(input int unsigned n, input int unsigned w); - return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); endfunction : sum_width + function int unsigned lo_width(input int unsigned i); + return OFFSETS[i+1] - OFFSETS[i]; + endfunction : lo_width + function int unsigned hi_width(input int unsigned i); + automatic int unsigned lw = lo_width(i); + return ACCU_WIDTH <= lw? + 0 : + 1 + ($clog2(SIMD) < ACCU_WIDTH-lw? 
+ ACCU_WIDTH-lw : + $clog2(2**(ACCU_WIDTH-lw-1)+SIMD) + ); + endfunction : hi_width + localparam int unsigned LO_WIDTH_MAX = lo_width(1); + localparam int unsigned HI_WIDTH_MAX = hi_width(0); + + + typedef int unsigned leaf_load_t[2*SIMD-1]; + function leaf_load_t init_leaf_loads(); + automatic leaf_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leaf_loads // Pipeline for last indicator flag - logic [1:5] L = '0; + // Depth: 3 cycles for DSP + external SIMD reduction + localparam int unsigned PIPELINE_DEPTH = 3 + $clog2(SIMD+1) + (SIMD == 1); +/* verilator lint_off LITENDIAN */ + logic [1:PIPELINE_DEPTH] L = '0; +/* verilator lint_on LITENDIAN */ always_ff @(posedge clk) begin if(rst) L <= '0; - else if(en) L <= { last, L[1:4] }; - end - assign vld = L[5]; - initial begin - if(mvu_pipeline_depth("mvu_8sx8u_dsp48") < $bits(L)) begin - $error("%m: Outdated pipeline depth computation."); - $stop; - end + else if(en) L <= { last, L[1:PIPELINE_DEPTH-1] }; end + assign vld = L[PIPELINE_DEPTH]; // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism - localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; - localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets - localparam int unsigned PIPE_COUNT = (PE+1)/2; for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes @@ -103,17 +180,17 @@ module mvu_8sx8u_dsp48 #( localparam int unsigned PE_END = PE < 2*(c+1)? 
PE : 2*(c+1); localparam int unsigned PE_REM = 2*(c+1) - PE_END; - uwire [47:0] p3[SIMD]; - uwire signed [ 1:0] h3[SIMD]; + uwire [P_WIDTH-1:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][2]; for(genvar s = 0; s < SIMD; s++) begin : genSIMD // Input Lane Assembly - uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; - logic [29:0] aa; - logic [26:0] dd; - logic [ 1:0] xx; + uwire [B_WIDTH-1:0] bb = { {(B_WIDTH-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [A_WIDTH-1:0] aa; + logic [A_WIDTH-1:0] dd; + logic [1:0] xx; if(1) begin : blkVectorize - uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + uwire signed [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin assign ww[pe] = w[PE_BEG + pe][s]; if(pe) begin @@ -138,33 +215,33 @@ module mvu_8sx8u_dsp48 #( dd = '0; aa = '0; for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin - dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; - aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + dd[OFFSETS[pe + PE_REM]+:WEIGHT_WIDTH-1] = ww[pe][0+:WEIGHT_WIDTH-1]; + aa[OFFSETS[pe + PE_REM]+ WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; end end end : blkVectorize - uwire [47:0] pp; + uwire [P_WIDTH-1:0] pp; // Note: Since the product B * AD is computed, // rst can be only applied to AD and zero only to B // with the same effect as zeroing both. 
if(BEHAVIORAL) begin : genBehav // Stage #1: Input Refine - logic signed [17:0] B1 = 0; + logic signed [B_WIDTH-1:0] B1 = 0; always_ff @(posedge clk) begin if(zero) B1 <= 0; else if(en) B1 <= bb; end - logic signed [26:0] AD1 = 0; + logic signed [A_WIDTH-1:0] AD1 = 0; always_ff @(posedge clk) begin if(rst) AD1 <= 0; else if(en) AD1 <= dd - aa; end // Stage #2: Multiply - logic signed [45:0] M2 = 0; + logic signed [A_WIDTH+B_WIDTH-1:0] M2 = 0; always_ff @(posedge clk) begin if(rst) M2 <= 0; else if(en) M2 <= @@ -175,7 +252,7 @@ module mvu_8sx8u_dsp48 #( end // Stage #3: Accumulate - logic signed [47:0] P3 = 0; + logic signed [P_WIDTH-1:0] P3 = 0; always_ff @(posedge clk) begin if(rst) P3 <= 0; else if(en) P3 <= M2 + (L[3]? 0 : P3); @@ -443,111 +520,127 @@ module mvu_8sx8u_dsp48 #( else if(en) begin X1 <= xx; X2 <= X1; - X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); + X3 <= X2 + (L[3]? 2'h0 : pp[OFFSETS[1]+:2]); end end - // Derive actual cross-lane overflows - assign h3[s] = pp[D[1]+:2] - X3; + // Derive actual cross-lane overflow + assign h3[s][0] = pp[OFFSETS[1]+:2] - X3; + // Overflow out of high lane + logic PZ = 0; + always_ff @(posedge clk) begin + if(rst) PZ <= 0; + else if(en) PZ <= L[3]? 0 : pp[$left(pp)]; + end + assign h3[s][1] = + ( PZ && !pp[$left(pp)-:2])? +1 : + (!PZ && &pp[$left(pp)-:2])? -1 : 0; assign p3[s] = pp; end : genSIMD - // Stage #4: Cross-SIMD Reduction + // Stage #4: Potentially Multiple Cycles of Cross-SIMD Reduction + // - binary reduction trees with SIMD leaf nodes for both the core lane outputs and the spill accumulation + // - balanced tree construction with all fully occupied levels pipelined // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop - - // Range of Cross-lane Contribution Tracked in Hi4 - /* - * - Assumption: ACCU_WIDTH bounds right lane value at any point in time. - * - The value x beyond the lane boundary is hence bounded by: - * -2^(w-1) <= x <= 2^(w-1)-1 with w = ACCU_WIDTH - D[1] - * - This value decomposes into the tracked overflow h and the overflow l - * from the low SIMD lane reduction with: - * 0 <= l <= SIMD - * - From x = l + h follows: - * h = x - l - * -2^(w-1) - SIMD <= h <= 2^(w-1)-1 - * - This required bit width of the two's complement representation of this - * signed value is determined by its lower bound to be at least: - * 1 + $clog2(2^(w-1)+SIMD) - */ - localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD)); - - uwire signed [ACCU_WIDTH -1:0] up4; - uwire signed [HI_WIDTH -1:0] hi4; - uwire [$clog2(SIMD)+D[1]-1:0] lo4; - - // Conclusive high part accumulation - if(PE_REM == 0) begin : genHi - - // Adder Tree across all SIMD high contributions, each from [-1:1] - uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); - assign tree[n] = s; - end + localparam leaf_load_t LEAF_LOAD = SIMD > 1 ? 
init_leaf_loads() : '{ default: 1 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leaf_loads ends up in infinite loop + localparam int unsigned HI_NODE_REGISTERED = 2**($clog2(SIMD+1)-1)-2; + + uwire signed [HI_WIDTH_MAX-1:0] hi4[2]; + uwire [LO_WIDTH_MAX-1:0] lo4[2]; + for(genvar i = 0; i < 2; i++) begin : genLanes + + // Conclusive high part accumulation + if(i < PE_REM) assign hi4[i] = 0; + else begin : genHi + localparam int unsigned HI_WIDTH = hi_width(i); + if(HI_WIDTH == 0) assign hi4[i] = 0; + else begin + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + typedef logic signed [$clog2(1+LEAF_LOAD[n]):0] sum_t; + uwire sum_t s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + if((0 < n) && (n <= HI_NODE_REGISTERED)) begin + sum_t S = 0; + always_ff @(posedge clk) begin + if(rst) S <= 0; + else if(en) S <= s; + end + assign tree[n] = S; + end + else assign tree[n] = s; + end - // High Sideband Accumulation - logic signed [HI_WIDTH-1:0] Hi4 = 0; - always_ff @(posedge clk) begin - if(rst) Hi4 <= 0; - else if(en) begin - automatic logic signed [HI_WIDTH:0] h = $signed(L[4]? 0 : Hi4) + $signed(tree[0]); - assert(h[HI_WIDTH] == h[HI_WIDTH-1]) else begin - $error("%m: Accumulation overflow for ACCU_WIDTH=%0d", ACCU_WIDTH); - $stop; + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) begin + automatic logic signed [HI_WIDTH:0] h = $signed(L[PIPELINE_DEPTH-1]? 
{(HI_WIDTH){1'b0}} : Hi4) + $signed(tree[0]); + assert(h[HI_WIDTH] === h[HI_WIDTH-1]) else begin + $error("%m [%0d:%0d]: Accumulation overflow for ACCU_WIDTH=%0d", c, i, ACCU_WIDTH); + $stop; + end + Hi4 <= h[HI_WIDTH-1:0]; + end end - Hi4 <= h; + assign hi4[i] = Hi4; end - end - assign hi4 = Hi4; - end : genHi - else begin : genHiZero - assign hi4 = '0; - end : genHiZero - - for(genvar i = 0; i < 2; i++) begin - localparam int unsigned LO_WIDTH = D[i+1] - D[i]; - // Conclusive low part accumulation - if(i >= PE_REM) begin : blkLo - // Adder Tree across all SIMD low contributions (all unsigned arithmetic) + end : genHi + + // Conclusive low part accumulation (all unsigned arithmetic) + if(i < PE_REM) assign lo4[i] = '0; + else begin : genLo + localparam int unsigned LO_WIDTH = lo_width(i); + + // Adder Tree across all SIMD low contributions localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; - for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; - for(genvar n = 0; n < SIMD-1; n++) begin - // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); - uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; - assign tree[n] = s; - end - logic [ROOT_WIDTH-1:0] Lo4 = 0; - always_ff @(posedge clk) begin - if(rst) Lo4 <= 0; - else if(en) Lo4 <= tree[0]; - end + if(SIMD == 1) begin : genReg + // Just slide in a balancing register + logic [ROOT_WIDTH-1:0] R = 'x; + always_ff @(posedge clk) begin + if(rst) R <= 'x; + else if(en) R <= p3[0][OFFSETS[i]+:LO_WIDTH]; + end + assign tree[0] = R; + end : genReg + else begin : genTree + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][OFFSETS[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = sum_width(LEAF_LOAD[n], LO_WIDTH); + uwire [NODE_WIDTH-1:0] s = 
tree[2*n+1] + tree[2*n+2]; + if(n <= HI_NODE_REGISTERED) begin + logic [NODE_WIDTH-1:0] S = 'x; + always_ff @(posedge clk) begin + if(rst) S <= 'x; + else if(en) S <= s; + end + assign tree[n] = S; + end + else assign tree[n] = s; + end + end : genTree - if(i == 1) assign up4 = Lo4; - else assign lo4 = Lo4; - end : blkLo - else begin : blkLoZero - assign lo4 = '0; - end : blkLoZero + assign lo4[i] = tree[0]; + end : genLo - end + end : genLanes // Stage #5: Resolve lane totals - logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 'x }; always_ff @(posedge clk) begin - if(rst) Res5 <= '{ default: 0 }; + if(rst) Res5 <= '{ default: 'x }; else if(en) begin - Res5[1] <= up4 - hi4; - Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + Res5[1] <= $signed({ hi4[1], {(lo_width(1)){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(lo_width(0)){1'b0}} }) + $signed({ 1'b0, lo4[0] }); end end diff --git a/finn-rtllib/mvu/mvu_pkg.sv b/finn-rtllib/mvu/mvu_pkg.sv deleted file mode 100644 index fa613b2aae..0000000000 --- a/finn-rtllib/mvu/mvu_pkg.sv +++ /dev/null @@ -1,22 +0,0 @@ -package mvu_pkg; - function int unsigned mvu_pipeline_depth( - input string core, - input int unsigned simd = 0, - input int unsigned seglen = 0 - ); - unique case(core) - "mvu_vvu_8sx9_dsp58": begin - automatic int chainlen = (simd+2)/3; - if(seglen == 0) seglen = chainlen; - return 3 + (chainlen-1)/seglen; - end - "mvu_4sx4u", "mvu_4sx4u_dsp48e1", "mvu_4sx4u_dsp48e2", - "mvu_8sx8u_dsp48": - return 5; - default: begin - $error("Unknown MVU core '%s'", core); - $finish; - end - endcase - endfunction : mvu_pipeline_depth -endpackage : mvu_pkg diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 7515b2e868..11bf4e9ccd 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -61,8 +61,6 @@ module 
mvu_vvu_8sx9_dsp58 #( output logic vld, output logic [PE-1:0][ACCU_WIDTH-1:0] p ); - import mvu_pkg::*; - // for verilator always use behavioral code localparam bit BEHAVIORAL = `ifdef VERILATOR @@ -70,6 +68,25 @@ module mvu_vvu_8sx9_dsp58 #( `endif FORCE_BEHAVIORAL; + //----------------------------------------------------------------------- + // Startup Recovery Watchdog + // The DSP slice needs 100ns of recovery time after initial startup before + // being able to ingest input properly. This watchdog discovers violating + // stimuli during simulation and produces a corresponding warning. + if(1) begin : blkRecoveryWatch + logic Dirty = 1; + initial begin + #100ns; + Dirty <= 0; + end + + always_ff @(posedge clk) begin + assert(!Dirty || rst || !en || zero) else begin + $warning("%m: Feeding input during DSP startup recovery. Expect functional errors."); + end + end + end : blkRecoveryWatch + //-------------------- Declare global signals --------------------\\ localparam int unsigned CHAINLEN = (SIMD+2)/3; localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? 
CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length @@ -91,12 +108,6 @@ module mvu_vvu_8sx9_dsp58 #( end end assign vld = L[0]; - initial begin - if(mvu_pipeline_depth("mvu_vvu_8sx9_dsp58", SIMD, SEGMENTLEN) < $bits(L)) begin - $error("%m: Outdated pipeline depth computation."); - $stop; - end - end //-------------------- Shift register for ZERO flag --------------------\\ logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 368c338f37..f8511cd7a4 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -91,7 +91,6 @@ module mvu_vvu_axi #( output logic m_axis_output_tvalid, input logic m_axis_output_tready ); - import mvu_pkg::*; //-------------------- Parameter sanity checks --------------------\\ initial begin @@ -346,10 +345,12 @@ module mvu_vvu_axi #( end : blkDsp if(1) begin : blkOutput - localparam int unsigned PIPELINE_DEPTH = mvu_pipeline_depth(COMPUTE_CORE, SIMD, SEGMENTLEN); - localparam int unsigned MIN_OUT_PERIOD = MW / SIMD; - localparam int unsigned MAX_IN_FLIGHT = 1 + PIPELINE_DEPTH / MIN_OUT_PERIOD; + localparam int unsigned CORE_PIPELINE_DEPTH = + COMPUTE_CORE == "dotp_8sx9_dsp58"? 3 + (SEGMENTLEN == 0? 0 : ((SIMD+2)/3 -1)/SEGMENTLEN) : + /* else */ 3 + $clog2(SIMD+1) + (SIMD == 1); + // This is conservative and could be divided by a guaranteed minimum output interval, e.g. MW/SIMD. + localparam int unsigned MAX_IN_FLIGHT = CORE_PIPELINE_DEPTH; typedef logic [PE-1:0][ACCU_WIDTH-1:0] output_t; logic signed [$clog2(MAX_IN_FLIGHT+1):0] OPtr = '1; // -1 | 0, 1, ..., MAX_IN_FLIGHT