Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into feature/support_resize…
Browse files Browse the repository at this point in the history
…_operator
  • Loading branch information
klassen9 committed Feb 4, 2025
2 parents d2ed4be + 88e207e commit b8e98de
Show file tree
Hide file tree
Showing 50 changed files with 1,571 additions and 407 deletions.
3 changes: 3 additions & 0 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@ sphinx:
python:
install:
- requirements: docs/requirements.txt

formats:
- pdf
2 changes: 1 addition & 1 deletion docs/finn/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian?
If you need to do this manually, first examine how the `FINN PYNQ Python drivers <https://github.com/Xilinx/finn-examples/blob/main/finn_examples/driver.py#L379>`_ do this – notice how the input data is
first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it
was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be
fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/data_packing.py#L289>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation.
fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn/blob/dev/src/finn/util/data_packing.py#L284>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for RTL simulation.

Why does FIFO sizing take so long for my network? Is something wrong?
The automatic FIFO sizing in FINN can take quite a long time. Unfortunately, it doesn’t really parallelize across multiple cores since
Expand Down
4 changes: 2 additions & 2 deletions fetch-repos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f"
QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9"
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
Expand Down
144 changes: 72 additions & 72 deletions finn-rtllib/fifo/hdl/Q_srl.v
Original file line number Diff line number Diff line change
Expand Up @@ -184,58 +184,58 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
end // always @ (posedge clock or negedge reset)

always @* begin // - combi always
srlo_ <= 'bx;
shift_en_o_ <= 1'bx;
shift_en_ <= 1'bx;
addr_ <= 'bx;
state_ <= 2'bx;
srlo_ = 'bx;
shift_en_o_ = 1'bx;
shift_en_ = 1'bx;
addr_ = 'bx;
state_ = 2'bx;
case (state)

state_empty: begin // - (empty, will not produce)
if (i_v) begin // - empty & i_v => consume
srlo_ <= i_d;
shift_en_o_ <= 1;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_one;
srlo_ = i_d;
shift_en_o_ = 1;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_one;
end
else begin // - empty & !i_v => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_empty;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_empty;
end
end

state_one: begin // - (contains one)
if (i_v && o_b) begin // - one & i_v & o_b => consume
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1;
addr_ <= 0;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1;
addr_ = 0;
state_ = state_more;
end
else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod
srlo_ <= i_d;
shift_en_o_ <= 1;
shift_en_ <= 1;
addr_ <= 0;
state_ <= state_one;
srlo_ = i_d;
shift_en_o_ = 1;
shift_en_ = 1;
addr_ = 0;
state_ = state_one;
end
else if (!i_v && o_b) begin // - one & !i_v & o_b => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_one;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_one;
end
else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_empty;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_empty;
end
end // case: state_one

Expand All @@ -244,60 +244,60 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
// - (full, will not consume)
// - (full here if depth==2)
if (o_b) begin // - full & o_b => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 0;
addr_ <= addr;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 0;
addr_ = addr;
state_ = state_more;
end
else begin // - full & !o_b => produce
srlo_ <= srl[addr];
shift_en_o_ <= 1;
shift_en_ <= 0;
// addr_ <= addr-1;
// state_ <= state_more;
addr_ <= addr_zero_ ? 0 : addr-1;
state_ <= addr_zero_ ? state_one : state_more;
srlo_ = srl[addr];
shift_en_o_ = 1;
shift_en_ = 0;
// addr_ = addr-1;
// state_ = state_more;
addr_ = addr_zero_ ? 0 : addr-1;
state_ = addr_zero_ ? state_one : state_more;
end
end
else begin // - (mid: neither empty nor full)
if (i_v && o_b) begin // - mid & i_v & o_b => consume
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1;
addr_ <= addr+1;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1;
addr_ = addr+1;
state_ = state_more;
end
else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod
srlo_ <= srl[addr];
shift_en_o_ <= 1;
shift_en_ <= 1;
addr_ <= addr;
state_ <= state_more;
srlo_ = srl[addr];
shift_en_o_ = 1;
shift_en_ = 1;
addr_ = addr;
state_ = state_more;
end
else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 0;
addr_ <= addr;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 0;
addr_ = addr;
state_ = state_more;
end
else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce
srlo_ <= srl[addr];
shift_en_o_ <= 1;
shift_en_ <= 0;
addr_ <= addr_zero_ ? 0 : addr-1;
state_ <= addr_zero_ ? state_one : state_more;
srlo_ = srl[addr];
shift_en_o_ = 1;
shift_en_ = 0;
addr_ = addr_zero_ ? 0 : addr-1;
state_ = addr_zero_ ? state_one : state_more;
end
end // else: !if(addr_full)
end // case: state_more

default: begin
srlo_ <= 'bx;
shift_en_o_ <= 1'bx;
shift_en_ <= 1'bx;
addr_ <= 'bx;
state_ <= 2'bx;
srlo_ = 'bx;
shift_en_o_ = 1'bx;
shift_en_ = 1'bx;
addr_ = 'bx;
state_ = 2'bx;
end // case: default

endcase // case(state)
Expand Down
10 changes: 6 additions & 4 deletions finn-rtllib/mvu/mvu_4sx4u.sv
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
module mvu_4sx4u #(
int unsigned PE,
int unsigned SIMD,
int unsigned WEIGHT_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned ACCU_WIDTH,

int unsigned VERSION = 1, // Version 1 (DSP48E1) *must* commit to NARROW_WEIGHTS
Expand All @@ -49,8 +51,8 @@ module mvu_4sx4u #(
// Input
input logic last,
input logic zero, // ignore current inputs and force this partial product to zero
input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights
input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS)
input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights
input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS)

// Output
output logic vld,
Expand Down Expand Up @@ -141,14 +143,14 @@ module mvu_4sx4u #(
for(genvar s = 0; s < SIMD; s++) begin : genSIMD

// Input Lane Assembly
uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] };
logic [29:0] aa;
logic [26:0] dd;
logic [ 1:0] xx[3:1];
if(1) begin : blkVectorize
uwire signed [3:0] ww[PE_END - PE_BEG];
for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin
assign ww[pe] = w[PE_BEG + pe][s];
assign ww[pe] = $signed(w[PE_BEG + pe][s]);
if(pe > 0) begin
if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
`ifndef VERILATOR
Expand Down
16 changes: 10 additions & 6 deletions finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@
module mvu_8sx8u_dsp48 #(
int unsigned PE,
int unsigned SIMD,
int unsigned ACCU_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned WEIGHT_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned ACCU_WIDTH,

int unsigned VERSION = 1,
bit SIGNED_ACTIVATIONS = 0,
Expand Down Expand Up @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #(
return res;
endfunction : init_leave_loads

// Bit width used to hold the sum of `n` unsigned operands of `w` bits each.
//  - w <= 16: exact bound, $clog2(1 + n*(2**w - 1)), i.e. the number of bits
//             needed to represent the largest possible sum n*(2**w - 1).
//  - w  > 16: w + $clog2(n) is used instead.
//             NOTE(review): presumably chosen to keep 2**w and the product
//             within integer range during elaboration - confirm.
function int unsigned sum_width(input int unsigned n, input int unsigned w);
	// Wide operands: take the additive estimate and leave early.
	if(w > 16) begin
		return w + $clog2(n);
	end
	// Narrow operands: evaluate the exact maximum-sum bound.
	return $clog2(1 + n*(2**w - 1));
endfunction : sum_width

// Pipeline for last indicator flag
logic [1:5] L = '0;
always_ff @(posedge clk) begin
Expand Down Expand Up @@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #(
// Stage #4: Cross-SIMD Reduction

// Count leaves reachable from each node
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop

// Range of Cross-lane Contribution Tracked in Hi4
/*
Expand All @@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #(
* signed value is determined by its lower bound to be at least:
* 1 + $clog2(2^(w-1)+SIMD)
*/
localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD);
localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD));

uwire signed [ACCU_WIDTH -1:0] up4;
uwire signed [HI_WIDTH -1:0] hi4;
Expand Down Expand Up @@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #(
// Conclusive low part accumulation
if(i >= PE_REM) begin : blkLo
// Adder Tree across all SIMD low contributions (all unsigned arithmetic)
localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH);
uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree;
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
for(genvar n = 0; n < SIMD-1; n++) begin
// Sum truncated to actual maximum bit width at this node
localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH);
uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2];
assign tree[n] = s;
end
Expand Down
7 changes: 4 additions & 3 deletions finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ module mvu_vvu_8sx9_dsp58 #(
bit IS_MVU,
int unsigned PE,
int unsigned SIMD,
int unsigned ACTIVATION_WIDTH,
int unsigned WEIGHT_WIDTH,
int unsigned ACCU_WIDTH,
int unsigned WEIGHT_WIDTH,
int unsigned ACTIVATION_WIDTH,
int unsigned ACCU_WIDTH,

bit SIGNED_ACTIVATIONS = 0,
int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
bit FORCE_BEHAVIORAL = 0,
Expand Down
23 changes: 16 additions & 7 deletions finn-rtllib/mvu/mvu_vvu_axi.sv
Original file line number Diff line number Diff line change
Expand Up @@ -300,17 +300,22 @@ module mvu_vvu_axi #(

case(COMPUTE_CORE)
"mvu_vvu_8sx9_dsp58":
mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
mvu_vvu_8sx9_dsp58 #(
.IS_MVU(IS_MVU),
.PE(PE), .SIMD(DSP_SIMD),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
.FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
);
"mvu_4sx4u_dsp48e1":
mvu_4sx4u #(
.PE(PE), .SIMD(DSP_SIMD),
.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.VERSION(1), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
Expand All @@ -320,16 +325,20 @@ module mvu_vvu_axi #(
"mvu_4sx4u_dsp48e2":
mvu_4sx4u #(
.PE(PE), .SIMD(DSP_SIMD),
.ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .NARROW_WEIGHTS(NARROW_WEIGHTS),
.VERSION(2), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
);
"mvu_8sx8u_dsp48":
mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
mvu_8sx8u_dsp48 #(
.PE(PE), .SIMD(DSP_SIMD),
.WEIGHT_WIDTH(WEIGHT_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
.SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
) core (
.clk(dsp_clk), .rst, .en(dsp_en),
.last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
.vld(dsp_vld), .p(dsp_p)
Expand Down
Loading

0 comments on commit b8e98de

Please sign in to comment.