From 4e0df8f045b7d74ada29e9852bf4f91bc39e297c Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 10 Nov 2023 18:02:35 +0000 Subject: [PATCH 001/291] [CustomOp-Refactor] Initial draft of splitting HLSCustomOp class in HWCustomOp with HLS and RTL Backend using FMPadding --- src/finn/custom_op/fpgadataflow/__init__.py | 6 +- src/finn/custom_op/fpgadataflow/fmpadding.py | 164 ++++++ .../custom_op/fpgadataflow/hls/__init__.py | 35 ++ .../fpgadataflow/hls/fmpadding_hls.py | 291 +++++++++++ src/finn/custom_op/fpgadataflow/hlsbackend.py | 419 +++++++++++++++ src/finn/custom_op/fpgadataflow/hwcustomop.py | 481 ++++++++++++++++++ .../custom_op/fpgadataflow/rtl/__init__.py | 35 ++ .../fpgadataflow/rtl/fmpadding_rtl.py | 257 ++++++++++ src/finn/custom_op/fpgadataflow/rtlbackend.py | 61 +++ .../fpgadataflow/specialize_layers.py | 71 +++ 10 files changed, 1819 insertions(+), 1 deletion(-) create mode 100644 src/finn/custom_op/fpgadataflow/fmpadding.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/__init__.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/hlsbackend.py create mode 100644 src/finn/custom_op/fpgadataflow/hwcustomop.py create mode 100644 src/finn/custom_op/fpgadataflow/rtl/__init__.py create mode 100644 src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py create mode 100644 src/finn/custom_op/fpgadataflow/rtlbackend.py create mode 100644 src/finn/transformation/fpgadataflow/specialize_layers.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 56d4230a3a..ce05998fcc 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -42,6 +43,7 @@ from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise +from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch @@ -93,3 +95,5 @@ custom_op["CheckSum"] = CheckSum custom_op["StreamingEltwise"] = StreamingEltwise custom_op["FMPadding_rtl"] = FMPadding_rtl + +custom_op["FMPadding"] = FMPadding diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py new file mode 100644 index 0000000000..0324984c3f --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/fmpadding.py @@ -0,0 +1,164 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class FMPadding(HWCustomOp): + """Abstraction layer for HW impplementation of FMPadding. + Pads input image by given amount.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # spatial size of input images + "ImgDim": ("ints", True, []), # [H, W] = [Y, X] + # total padding (per dimension) to apply + "Padding": ( + "ints", + True, + [1, 1, 1, 1], + ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] + # number of channels in input image + "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), + # FINN input datatype + "inputDataType": ("s", True, ""), + # shape describing input vecs per execution + "numInputVectors": ("i", False, 1), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_padded_odim(self): + "Return the padded spatial size of the output." + idim_h, idim_w = self.get_nodeattr("ImgDim") + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + odim_h = idim_h + pad_h + odim_w = idim_w + pad_w + return [odim_h, odim_w] + + def get_exp_cycles(self): + odim_h, odim_w = self.get_padded_odim() + channels = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + batch_size = self.get_nodeattr("numInputVectors") + exp_cycles = (channels / simd) * batch_size * odim_h * odim_w + return int(exp_cycles) + + def get_normal_input_shape(self, ind=0): + idim_h, idim_w = self.get_nodeattr("ImgDim") + num_ch = self.get_nodeattr("NumChannels") + ishape = (1, idim_h, idim_w, num_ch) + return ishape + + def get_normal_output_shape(self, ind=0): + odim_h, odim_w = self.get_padded_odim() + num_ch = self.get_nodeattr("NumChannels") + + oshape = (1, odim_h, odim_w, num_ch) + return oshape + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_ishape[-1] / simd) + folded_ishape = normal_ishape[:-1] + [fold, simd] + return tuple(folded_ishape) + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + assert ifm_ch % simd == 0, "SIMD must divide input channels" + fold = int(normal_oshape[-1] / simd) + folded_oshape = normal_oshape[:-1] + [fold, simd] + return tuple(folded_oshape) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for FMPadding." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + # the hlslib op always pads with zeros, so ensure that the DataType + # is able to represent zeros + assert ret.allowed(0), "FMPadding_Batch DataType must support zero" + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return ibits * simd + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return obits * simd + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def execute_node(self, context, graph): + pass diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py new file mode 100644 index 0000000000..f381639fba --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -0,0 +1,35 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls + +custom_op = dict() + +# make sure new HLSCustomOp subclasses are imported here so that they get +# registered and plug in correctly into the infrastructure +custom_op["FMPadding_hls"] = FMPadding_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py new file mode 100644 index 0000000000..3b0b870e23 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -0,0 +1,291 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class FMPadding_hls(FMPadding, HLSBackend): + """Corresponds to finn-hlslib FMPadding_Batch function. + Pads input image by given amount.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(FMPadding.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + idim_h, idim_w = self.get_nodeattr("ImgDim") + odim_h, odim_w = self.get_padded_odim() + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w + + if is_square_img and is_square_pad: + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim1 {}\n#define OutputDim1 {}\n + #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n + #define NumChannels1 {}\n#define SIMD1 {}\n + #define numReps {}\n""".format( + idim_h, + odim_h, + pad[0], + pad[2], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("numInputVectors"), + ) + ] + else: + self.code_gen_dict["$DEFINES$"] = [ + """ + #define OutputDim1_x {}\n + #define OutputDim1_y {}\n + #define PaddingLeft1 {}\n + #define PaddingRight1 {}\n + #define PaddingTop1 {}\n + #define PaddingBottom1 {}\n + #define NumChannels1 {}\n + #define SIMD1 {}\n + #define numReps {}\n + """.format( + odim_w, + odim_h, + pad[1], + pad[3], + pad[0], + pad[2], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("numInputVectors"), + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + in_t = self.get_input_datatype().get_hls_datatype_str() + idim_h, idim_w = self.get_nodeattr("ImgDim") + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w + + if is_square_img and is_square_pad: + hls_call = "FMPadding_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, numReps);""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() + ) + ] + else: + hls_call = "FMPadding_nonsquare_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, numReps);""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py new file mode 100644 index 0000000000..403b992a05 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -0,0 +1,419 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import subprocess +from abc import ABC, abstractmethod +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow import templates +from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir +from finn.util.hls import CallHLS +from finn.util.pyverilator import make_single_source_file + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class HLSBackend(ABC): + """HLSBackend class all custom ops that correspond to a finn-hlslib + function are using functionality of. Contains different functions every HLS + custom node should have. Some as abstract methods, these have to be filled + when writing a new HLS custom op node.""" + + def get_nodeattr_types(self): + return { + "code_gen_dir_cppsim": ("s", False, ""), + "executable_path": ("s", False, ""), + "res_hls": ("s", False, ""), + } + + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + assert ( + code_gen_dir != "" + ), """Node attribute "code_gen_dir_ipgen" is + not set. Please run HLSSynthIP first.""" + verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name) + # default impl only returns the HLS verilog codegen dir + return [verilog_path] + + def get_all_verilog_filenames(self, abspath=False): + "Return list of all Verilog files used for this node." + + verilog_files = [] + verilog_paths = self.get_all_verilog_paths() + for verilog_path in verilog_paths: + for f in os.listdir(verilog_path): + if f.endswith(".v"): + if abspath: + verilog_files += [verilog_path + "/" + f] + else: + verilog_files += [f] + return verilog_files + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + verilog_files = self.get_all_verilog_filenames(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + + # build the Verilator emu library + sim = PyVerilator.build( + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipgen(self, model, fpgapart, clk): + """Generates c++ code and tcl script for ip generation.""" + node = self.onnx_node + + # generate top cpp file for ip generation + path = self.get_nodeattr("code_gen_dir_ipgen") + self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + self.generate_params(model, path) + self.global_includes() + self.defines("ipgen") + self.blackboxfunction() + self.pragmas() + self.docompute() + + template = templates.ipgen_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + # generate tcl script for ip generation + self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] + self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] + self.code_gen_dict["$FPGAPART$"] = [fpgapart] + self.code_gen_dict["$TOPFXN$"] = [node.name] + self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] + self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() + self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() + + template = templates.ipgentcl_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + def ipgen_default_directives(self): + """Return list of default HLS synthesis directives""" + + default_directives = [ + "set_param hls.enable_hidden_option_error false", + "config_compile -disable_unroll_code_size_check -pipeline_style flp", + "config_interface -m_axi_addr64", + "config_rtl -module_auto_prefix", + "config_rtl -deadlock_detection none", + ] + return default_directives + + def ipgen_extra_directives(self): + "Return a list of extra tcl directives for HLS synthesis." + return [] + + def ipgen_singlenode_code(self): + """Builds the bash script for IP generation using the CallHLS utility.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + builder = CallHLS() + builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name)) + builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) + builder.build(code_gen_dir) + ipgen_path = builder.ipgen_path + assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path) + self.set_nodeattr("ipgen_path", ipgen_path) + ip_path = ipgen_path + "/sol1/impl/ip" + assert os.path.isdir(ip_path), "IPGen failed: %s not found. Check log under %s" % ( + ip_path, + code_gen_dir, + ) + self.set_nodeattr("ip_path", ip_path) + vlnv = "xilinx.com:hls:%s:1.0" % node.name + self.set_nodeattr("ip_vlnv", vlnv) + + def code_generation_cppsim(self, model): + """Generates c++ code for simulation (cppsim).""" + node = self.onnx_node + path = self.get_nodeattr("code_gen_dir_cppsim") + self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + self.generate_params(model, path) + self.global_includes() + self.defines("cppsim") + self.read_npy_data() + self.strm_decl() + self.pragmas() + self.docompute() + self.dataoutstrm() + self.save_as_npy() + + template = templates.docompute_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + vlnv = self.get_nodeattr("ip_vlnv") + cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)] + return cmd + + def compile_singlenode_code(self): + """Builds the bash script for compilation using the CppBuilder from + finn.util.basic and executes the script to produce the executable.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + builder = CppBuilder() + # to enable additional debug features please uncommand the next line + # builder.append_includes("-DDEBUG") + builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") + builder.append_includes("-I$FINN_ROOT/deps/cnpy/") + builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") + builder.append_includes("-I$FINN_ROOT/custom_hls") + builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) + builder.append_includes("--std=c++14") + builder.append_includes("-O3") + builder.append_sources(code_gen_dir + "/*.cpp") + builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") + builder.append_includes("-lz") + builder.set_executable_path(code_gen_dir + "/node_model") + builder.build(code_gen_dir) + self.set_nodeattr("executable_path", builder.executable_path) + + def dynamic_input_to_npy(self, context, count, target_dir=""): + """Saves input (given context) into .npy files. + + Count indicates the number of inputs that have to be saved.""" + node = self.onnx_node + if target_dir == "": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + if code_gen_dir == "": + raise Exception( + """ + Found no codegen dir for this node, did you run the prepare_cppsim transformation? + """ + ) + target_dir = code_gen_dir + # create a npy file for each input of the node (in_ind is input index) + # assuming dynamic inputs start from 0 + for in_ind in range(count): + current_input_name = node.input[in_ind] + input_array = context[current_input_name] + if in_ind == 0: + expected_inp_shape = self.get_folded_input_shape() + idt = self.get_input_datatype() + else: + expected_inp_shape = self.get_folded_input_shape(in_ind) + idt = self.get_input_datatype(in_ind) + reshaped_input = input_array.reshape(expected_inp_shape) + if idt == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(target_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + + def npy_to_dynamic_output(self, context): + """Reads the output from an output.npy file generated from cppsim and + places its content into the context dictionary.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + output = np.load("{}/output.npy".format(code_gen_dir)) + exp_shape = self.get_normal_output_shape() + context[node.output[0]] = output.reshape(exp_shape) + + def npy_to_dynamic_outputs(self, context, npy_list): + """Reads the output from .npy files generated from cppsim and places + their content into the context dictionary. + npy_list is a list specifying which files to read, and its order must + match the order of node outputs.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + for i in range(len(npy_list)): + output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) + if i == 0: + exp_shape = self.get_normal_output_shape() + else: + exp_shape = self.get_normal_output_shape(i) + context[node.output[i]] = output.reshape(exp_shape) + + def exec_precompiled_singlenode_model(self): + """Executes precompiled executable.""" + executable_path = self.get_nodeattr("executable_path") + if executable_path == "": + raise Exception( + """ +Found no executable for this node, did you run the codegen and +compilation transformations? + """ + ) + process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) + process_execute.communicate() + + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + return "V" + + def execute_node(self, context, graph): + """Executes single node using cppsim or rtlsim.""" + mode = self.get_nodeattr("exec_mode") + if mode == "cppsim": + # save input(s) + self.dynamic_input_to_npy(context, 1) + # execute the precompiled model + self.exec_precompiled_singlenode_model() + # load output npy file + self.npy_to_dynamic_output(context) + elif mode == "rtlsim": + pass + + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + @abstractmethod + def global_includes(self): + """Function to set the global includes for c++ code that has to be generated + for cppsim or rtlsim, is member function of HLSBackend class but has to + be filled by every node.""" + pass + + @abstractmethod + def defines(self, var): + """Function to set the define commands for c++ code that has to be generated + for cppsim or rtlsim, is member function of HLSBackend class but has to + be filled by every node. + + var: makes it possible to reuse the function for different c++ code generation. + I.e. if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are + added.""" + pass + + @abstractmethod + def read_npy_data(self): + """Function to generate the commands for reading data from .npy file in c++, + is member function of HLSBackend class but has to be filled by every node.""" + pass + + @abstractmethod + def strm_decl(self): + """Function to generate the commands for the stream declaration in c++, + is member function of HLSBackend class but has to be filled + by every node.""" + pass + + @abstractmethod + def docompute(self): + """Function to generate the commands for the computational part of the + c++ code, is member function of HLSBackend class but has to be filled + by every node.""" + pass + + @abstractmethod + def dataoutstrm(self): + """Function to generate the commands for reading out data from c++ and convert + into npy format, is member function of HLSBackend class but has to be filled + by every node.""" + pass + + @abstractmethod + def save_as_npy(self): + """Function to generate the commands for saving data in .npy file in c++, + is member function of HLSBackend class but has to be filled by every node.""" + pass + + @abstractmethod + def blackboxfunction(self): + """Function to generate a blackbock function in c++ from which an IP block + will be generated, is member function of HLSBackend class but has to be filled + by every node.""" + pass + + @abstractmethod + def pragmas(self): + """Function to generate the pragma commands in c++, is member function of + HLSBackend class but has to be filled by every node.""" + pass + + def get_ap_int_max_w(self): + """Return the maximum width of any ap_int used in this module. Used to set the + AP_INT_MAX_W definition for HLS.""" + instream = self.get_instream_width() + outstream = self.get_outstream_width() + ret = max([instream, outstream]) + assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret + return ret diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py new file mode 100644 index 0000000000..bf89bcc0b4 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -0,0 +1,481 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import warnings +from abc import abstractmethod +from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from qonnx.custom_op.base import CustomOp +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.util.basic import pyverilate_get_liveness_threshold_cycles + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class HWCustomOp(CustomOp): + """HWCustomOp class all custom ops that can be implemented with either + HLS or RTL backend are based on. Contains different functions every fpgadataflow + custom node should have. Some as abstract methods, these have to be filled + when writing a new fpgadataflow custom op node.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + self.code_gen_dict = {} + + def get_nodeattr_types(self): + return { + "backend": ("s", True, "fpgadataflow"), + "preferred_impl_style": ("s", False, "", {"", "hls", "rtl"}), + "code_gen_dir_ipgen": ("s", False, ""), + "ipgen_path": ("s", False, ""), + "ip_path": ("s", False, ""), + "ip_vlnv": ("s", False, ""), + "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim"}), + "cycles_rtlsim": ("i", False, 0), + "cycles_estimate": ("i", False, 0), + "rtlsim_trace": ("s", False, ""), + "res_estimate": ("s", False, ""), + "res_synth": ("s", False, ""), + "rtlsim_so": ("s", False, ""), + # partitioning info + # ID of SLR to which the Op is attached in Vitis builds + # Set to -1 as 'don't care' + "slr": ("i", False, -1), + # Vitis memory port to which any AXI-MM interface + # of this Op should be attached in Vitis builds + # E.g.: "DDR[0]", "HBM[0]", "PLRAM[0]" + "mem_port": ("s", False, ""), + # Partition to which the Op belongs; all Ops with the + # same partition_id are stitched together + # Users should avoid setting this attribute manually + # and instead use the floorplan transform to set + # partition IDs from Vitis design rules and SLR IDs + "partition_id": ("i", False, 0), + # ID of FPGA device to which this Op is allocated, in + # a multi-FPGA setting + "device_id": ("i", False, 0), + # input and output FIFO depths for multi-I/O nodes + "inFIFODepths": ("ints", False, [2]), + "outFIFODepths": ("ints", False, [2]), + "output_hook": ("s", False, ""), + # accumulated characteristic function over two periods + "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)), + "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)), + # the period for which the characterization was run + "io_chrc_period": ("i", False, 0), + # amount of zero padding inserted during chrc. + "io_chrc_pads_in": ("ints", False, []), + "io_chrc_pads_out": ("ints", False, []), + } + + def get_verilog_top_module_name(self): + "Return the Verilog top module name for this node." + + node = self.onnx_node + prefixed_top_name = node.name + + return prefixed_top_name + + def get_verilog_top_module_intf_names(self): + """Return a dict of names of input and output interfaces. + The keys reflect the protocols each interface implements: + 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. + Values are lists of tuples (axis, aximm) or names (axilite): + 'axis' tuples correspond to the list of node inputs in order, + each tuple is (interface_name, interface_width_bits). + axilite always assumed to be 32 bits and is not tuple (name only). + Each block must have at most one aximm and one axilite.""" + intf_names = {} + intf_names["clk"] = ["ap_clk"] + intf_names["rst"] = ["ap_rst_n"] + sname = self.hls_sname() + intf_names["s_axis"] = [("in0_" + sname, self.get_instream_width_padded())] + intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())] + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + return intf_names + + def get_verilog_top_filename(self): + "Return the Verilog top module filename for this node." + + verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( + self.get_nodeattr("code_gen_dir_ipgen"), + self.onnx_node.name, + self.get_verilog_top_module_name(), + ) + return verilog_file + + def get_rtlsim(self): + """Return a PyVerilator wrapper for the Verilator emulation library + for this node.""" + + rtlsim_so = self.get_nodeattr("rtlsim_so") + assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library." + # create PyVerilator wrapper + sim = PyVerilator(rtlsim_so) + return sim + + def node_res_estimation(self): + """Returns summarized resource estimation of BRAMs and LUTs + of the node as a dictionary.""" + ret = dict() + ret["BRAM_18K"] = self.bram_estimation() + ret["BRAM_efficiency"] = self.bram_efficiency_estimation() + ret["LUT"] = self.lut_estimation() + ret["URAM"] = self.uram_estimation() + ret["URAM_efficiency"] = self.uram_efficiency_estimation() + ret["DSP"] = self.dsp_estimation() + return ret + + def bram_efficiency_estimation(self): + """Function for BRAM efficiency estimation: actual parameter storage + needed divided by the allocated BRAM storage (from estimation)""" + return 1 + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + return 1 + + def bram_estimation(self): + """Function for BRAM resource estimation, is member function of + HLSCustomOp class but has to be filled by every node""" + return 0 + + def uram_estimation(self): + """Function for UltraRAM resource estimation, is member function of + HLSCustomOp class but has to be filled by every node""" + return 0 + + def lut_estimation(self): + """Function for LUT resource estimation, is member function of + HLSCustomOp class but has to be filled by every node""" + return 0 + + def dsp_estimation(self): + """Function for DSP resource estimation, is member function of + HLSCustomOp class but has to be filled by every node""" + return 0 + + def get_exp_cycles(self): + """Function for estimation of expected cycles for set folding, + is member function of HLSCustomOp class but has to be filled + by every node""" + return 0 + + def get_op_and_param_counts(self): + """Return a dictionary with number of ops needed per inference for + this layer as well as parameter count (weights, thresholds, etc.). + Entries should be in the format: + {op_ : , param_: }.""" + return {} + + def reset_rtlsim(self, sim): + """Sets reset input in pyverilator to zero, toggles the clock and set it + back to one""" + sim.io.ap_rst_n = 0 + sim.io.ap_clk = 1 + sim.io.ap_clk = 0 + sim.io.ap_rst_n = 1 + + def toggle_clk(self, sim): + """Toggles the clock input in pyverilator once.""" + sim.io.ap_clk = 1 + sim.io.ap_clk = 0 + + def rtlsim(self, sim, inp, inp2=None): + """Runs the pyverilator simulation by passing the input values to the simulation, + toggle the clock and observing the execution time. Function contains also an + observation loop that can abort the simulation if no output value is produced + after 100 cycles.""" + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file != "": + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + sim.start_vcd_trace(trace_file) + inputs = inp + outputs = [] + sname = self.hls_sname() + o_ready = "out_" + sname + "_TREADY" + o_valid = "out_" + sname + "_TVALID" + o_data = "out_" + sname + "_TDATA" + in0_ready = "in0_" + sname + "_TREADY" + in0_valid = "in0_" + sname + "_TVALID" + in0_data = "in0_" + sname + "_TDATA" + in1_ready = "in1_" + sname + "_TREADY" + in1_valid = "in1_" + sname + "_TVALID" + in1_data = "in1_" + sname + "_TDATA" + + sim.io[o_ready] = 1 + + # observe if output is completely calculated + # observation_count will contain the number of cycles the calculation ran + num_out_values = self.get_number_output_values() + output_observed = False + observation_count = 0 + + # avoid infinite looping of simulation by aborting when there is no change in + # output values after 100 cycles + no_change_count = 0 + old_outputs = outputs + liveness_threshold = pyverilate_get_liveness_threshold_cycles() + + while not (output_observed): + sim.io[in0_valid] = 1 if len(inputs) > 0 else 0 + sim.io[in0_data] = inputs[0] if len(inputs) > 0 else 0 + if sim.io[in0_ready] == 1 and sim.io[in0_valid] == 1: + inputs = inputs[1:] + + if inp2 is not None: + sim.io[in1_valid] = 1 if len(inp2) > 0 else 0 + sim.io[in1_data] = inp2[0] if len(inp2) > 0 else 0 + if sim.io[in1_ready] == 1 and sim.io[in1_valid] == 1: + inp2 = inp2[1:] + + if sim.io[o_valid] == 1 and sim.io[o_ready] == 1: + outputs = outputs + [sim.io[o_data]] + sim.io.ap_clk = 1 + sim.io.ap_clk = 0 + + observation_count = observation_count + 1 + no_change_count = no_change_count + 1 + + if len(outputs) == num_out_values: + self.set_nodeattr("cycles_rtlsim", observation_count) + output_observed = True + + if no_change_count == liveness_threshold: + if old_outputs == outputs: + if trace_file != "": + sim.flush_vcd_trace() + sim.stop_vcd_trace() + raise Exception( + "Error in simulation! Takes too long to produce output. " + "Consider setting the LIVENESS_THRESHOLD env.var. to a " + "larger value." + ) + else: + no_change_count = 0 + old_outputs = outputs + if trace_file != "": + sim.flush_vcd_trace() + sim.stop_vcd_trace() + return outputs + + def rtlsim_multi_io(self, sim, io_dict): + "Run rtlsim for this node, supports multiple i/o streams." + + # signal name + sname = "_" + self.hls_sname() + "_" + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + num_out_values = self.get_number_output_values() + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + + def generate_params(self, model, path): + """Function to generate parameters (i.e. weights and thresholds), + is member function of HLSCustomOp class but has to be filled + by every node.""" + pass + + @abstractmethod + def get_number_output_values(self): + """Function to get the number of expected output values, + is member function of HLSCustomOp class but has to be filled + by every node.""" + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input stream ind.""" + raise Exception("get_input_datatype not implemented for this op") + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output stream ind.""" + raise Exception("get_output_datatype not implemented for this op") + + def get_normal_input_shape(self, ind=0): + """Returns normal input shape if implemented.""" + raise Exception("get_normal_input_shape not implemented for this op") + + def get_normal_output_shape(self, ind=0): + """Returns folded output shape if implemented.""" + raise Exception("get_normal_output_shape not implemented for this op") + + def get_folded_input_shape(self, ind=0): + """Returns folded input shape (according to synapse folding), if implemented.""" + raise Exception("get_folded_input_shape not implemented for this op") + + def get_folded_output_shape(self, ind=0): + """Returns folded output shape (according to neuron folding), if implemented.""" + raise Exception("get_folded_output_shape not implemented for this op") + + def get_instream_width(self, ind=0): + """Returns input stream width, if implemented.""" + raise Exception("get_instream_width not implemented for this op") + + def get_outstream_width(self, ind=0): + """Returns output stream width, if implemented.""" + raise Exception("get_outstream_width not implemented for this op") + + def get_instream_width_padded(self, ind=0): + """Returns input stream width padded to a multiple of 8. This is required + by the AXI Stream spec.""" + in_width = self.get_instream_width(ind=ind) + return roundup_to_integer_multiple(in_width, 8) + + def get_outstream_width_padded(self, ind=0): + """Returns output stream width padded to a multiple of 8. This is required + by the AXI Stream spec.""" + out_width = self.get_outstream_width(ind=ind) + return roundup_to_integer_multiple(out_width, 8) + + def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): + """Return the unconstrained characteristic functions for this node.""" + # ensure rtlsim is ready + assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name + if self.get_nodeattr("io_chrc_period") > 0: + warnings.warn("Skipping node %s: already has FIFO characteristic" % self.onnx_node.name) + return + exp_cycles = self.get_exp_cycles() + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + n_outs = np.prod(self.get_folded_output_shape()[:-1]) + if exp_cycles == 0: + # try to come up with an optimistic estimate + exp_cycles = min(n_inps, n_outs) + assert ( + exp_cycles <= period + ), "Period %d too short to characterize %s : expects min %d cycles" % ( + period, + self.onnx_node.name, + exp_cycles, + ) + sim = self.get_rtlsim() + # signal name + sname = "_" + self.hls_sname() + "_" + if override_rtlsim_dict is not None: + io_dict = override_rtlsim_dict + else: + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + + # extra dicts to keep track of cycle-by-cycle transaction behavior + # note that we restrict key names to filter out weight streams etc + txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} + + def monitor_txns(sim_obj): + for inp in txns_in: + in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 + in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 + if in_ready and in_valid: + txns_in[inp].append(1) + else: + txns_in[inp].append(0) + for outp in txns_out: + if ( + _read_signal(sim, outp + sname + "TREADY") == 1 + and _read_signal(sim, outp + sname + "TVALID") == 1 + ): + txns_out[outp].append(1) + else: + txns_out[outp].append(0) + + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + n_outs, + sname=sname, + liveness_threshold=period, + hook_preclk=monitor_txns, + ) + assert ( + total_cycle_count <= period + ), """Total cycle count from rtl simulation is higher than + specified period, please set the period higher than {}""".format( + total_cycle_count + ) + self.set_nodeattr("io_chrc_period", period) + + def accumulate_char_fxn(chrc): + p = len(chrc) + ret = [] + for t in range(2 * p): + if t == 0: + ret.append(chrc[0]) + else: + ret.append(ret[-1] + chrc[t % p]) + return np.asarray(ret, dtype=np.int32) + + all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) + all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) + all_pad_in = [] + all_pad_out = [] + for in_idx, in_strm_nm in enumerate(txns_in.keys()): + txn_in = txns_in[in_strm_nm] + if len(txn_in) < period: + pad_in = period - len(txn_in) + txn_in += [0 for x in range(pad_in)] + txn_in = accumulate_char_fxn(txn_in) + all_txns_in[in_idx, :] = txn_in + all_pad_in.append(pad_in) + + for out_idx, out_strm_nm in enumerate(txns_out.keys()): + txn_out = txns_out[out_strm_nm] + if len(txn_out) < period: + pad_out = period - len(txn_out) + txn_out += [0 for x in range(pad_out)] + txn_out = accumulate_char_fxn(txn_out) + all_txns_out[out_idx, :] = txn_out + all_pad_out.append(pad_out) + + self.set_nodeattr("io_chrc_in", all_txns_in) + self.set_nodeattr("io_chrc_out", all_txns_out) + self.set_nodeattr("io_chrc_pads_in", all_pad_in) + self.set_nodeattr("io_chrc_pads_out", all_pad_out) diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py new file mode 100644 index 0000000000..7c9b2eaf22 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -0,0 +1,35 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.fpgadataflow.rtl.fmpadding_rtl import FMPadding_rtl + +custom_op = dict() + +# make sure new HLSCustomOp subclasses are imported here so that they get +# registered and plug in correctly into the infrastructure +custom_op["FMPadding_rtl"] = FMPadding_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py new file mode 100644 index 0000000000..3c8a1ad777 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -0,0 +1,257 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import shutil +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class FMPadding_rtl(FMPadding, RTLBackend): + """CustomOp wrapper for the finn-rtllib fmpadding_axi component + Supports adjusting the padding amount and spatial feature sizes at + runtime.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Enable reprogrammable implementation to change FM dimensions, + # stride, or dilation during runtime + "dynamic_mode": ("i", False, 0, {0, 1}), + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + my_attrs.update(FMPadding.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def get_verilog_top_module_intf_names(self): + # Overload default HLSCustomOp implementation to add axilite control IF + intf_names = super().get_verilog_top_module_intf_names() + if self.get_nodeattr("dynamic_mode"): + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + raise Exception("cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" + + def get_template_values(self, ifm_dims, pads, chans, simd, idt): + dimY, dimX = ifm_dims + padT, padL, padB, padR = pads + y_counter_bits = int(math.ceil(math.log2(padT + dimY + padB + 1))) + x_counter_bits = int(math.ceil(math.log2(padL + dimX + padR + 1))) + topname = self.get_verilog_top_module_name() + stream_bits = idt.bitwidth() * simd + stream_bits = int(roundup_to_integer_multiple(stream_bits, 8)) + code_gen_dict = { + "XCOUNTER_BITS": int(x_counter_bits), + "YCOUNTER_BITS": int(y_counter_bits), + "NUM_CHANNELS": int(chans), + "SIMD": int(simd), + "ELEM_BITS": idt.bitwidth(), + "TOP_MODULE_NAME": topname, + "INIT_XON": int(padL), + "INIT_XOFF": int(padL + dimX), + "INIT_XEND": int(padL + dimX + padR - 1), + "INIT_YON": int(padT), + "INIT_YOFF": int(padT + dimY), + "INIT_YEND": int(padT + dimY + padB - 1), + "STREAM_BITS": int(stream_bits), + } + return code_gen_dict + + def get_dynamic_config(self, ifm_dims=None, pads=None): + """Returns a configuration dict to re-configure FM dimension and + padding amounts during runtime.""" + + if ifm_dims is None: + ifm_dims = self.get_nodeattr("ImgDim") + if pads is None: + pads = self.get_nodeattr("Padding") + chans = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + idt = self.get_input_datatype() + code_gen_dict = self.get_template_values(ifm_dims, pads, chans, simd, idt) + config = { + "XON": (0 * 4, (code_gen_dict["INIT_XON"])), + "XOFF": (1 * 4, (code_gen_dict["INIT_XOFF"])), + "XEND": (2 * 4, (code_gen_dict["INIT_XEND"])), + "YON": (3 * 4, (code_gen_dict["INIT_YON"])), + "YOFF": (4 * 4, (code_gen_dict["INIT_YOFF"])), + "YEND": (5 * 4, (code_gen_dict["INIT_YEND"])), + } + return config + + def generate_hdl(self): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" + template_path = rtlsrc + "/fmpadding_template.v" + dims = self.get_nodeattr("ImgDim") + pads = self.get_nodeattr("Padding") + chans = self.get_nodeattr("NumChannels") + simd = self.get_nodeattr("SIMD") + idt = self.get_input_datatype() + code_gen_dict = self.get_template_values(dims, pads, chans, simd, idt) + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "$%s$" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + sv_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"] + for sv_file in sv_files: + shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "fmpadding_axi.sv", + "fmpadding.sv", + "axi2we.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "fmpadding_axi.sv", + "fmpadding.sv", + "axi2we.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py new file mode 100644 index 0000000000..4c1977852c --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from abc import ABC, abstractmethod + + +class RTLBackend(ABC): + """RTLBackend class all custom ops that correspond to a module in finn-rtllib + are using functionality of. Contains different functions every RTL + custom node should have. Some as abstract methods, these have to be filled + when writing a new RTL custom op node.""" + + def get_nodeattr_types(self): + return {} + + @abstractmethod + def generate_hdl(self): + pass + + @abstractmethod + def prepare_rtlsim(self): + pass + + @abstractmethod + def code_generation_ipi(self): + pass + + def code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl() + + # TODO: Implement alternative + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + return "V" diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py new file mode 100644 index 0000000000..4c926ad9b1 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -0,0 +1,71 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +from onnx import helper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes + + +class SpecializeFMPadding(Transformation): + """Convert FMPadding layer to FMPadding_hls or FMPadding_rtl.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "FMPadding": + pad_input = node.input[0] + pad_output = node.output[0] + pad_inst = getCustomOp(node) + impl_style = pad_inst.get_nodeattr("preferred_impl_style") + if impl_style == "": + impl_style = "rtl" + optype = node.op_type + "_" + impl_style + new_node = helper.make_node( + optype, + [pad_input], + [pad_output], + domain="finn.custom_op.fpgadataflow." + impl_style, + ) + # add all attributes + for attribute in node.attribute: + if attribute.name != "preferred_impl_style": + new_node.attribute.append(attribute) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(node) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) From 61e3de2c0a4afae6b32e8cfbf1ead7258141d260 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 16 Nov 2023 17:44:21 +0000 Subject: [PATCH 002/291] [AddStreams] Initial commit for AddStreams with new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + src/finn/custom_op/fpgadataflow/addstreams.py | 162 ++++++++++ .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/addstreams_hls.py | 281 ++++++++++++++++++ .../fpgadataflow/specialize_layers.py | 47 ++- 5 files changed, 493 insertions(+), 1 deletion(-) create mode 100644 src/finn/custom_op/fpgadataflow/addstreams.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index ce05998fcc..c7bf09d0c2 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -27,6 +27,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from finn.custom_op.fpgadataflow.addstreams import AddStreams from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch from finn.custom_op.fpgadataflow.checksum import CheckSum @@ -97,3 +98,4 @@ custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["FMPadding"] = FMPadding +custom_op["AddStreams"] = AddStreams diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py new file mode 100644 index 0000000000..0f1336c6e1 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/addstreams.py @@ -0,0 +1,162 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class AddStreams(HWCustomOp): + """Abstraction layer for HW implementation of AddStreams.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + # we need to set output datatype to the next larger int or uint + # enhancement: consider specifying w/ explicit outputDataType attribute + # to allow overflow and use the same idt if user wants + idt = DataType[self.get_nodeattr("inputDataType")] + if idt.signed(): + return DataType.get_smallest_possible(2 * idt.min()) + else: + return DataType.get_smallest_possible(2 * idt.max()) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + pass + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + "in1": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index f381639fba..f978a8616c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls custom_op = dict() @@ -33,3 +34,4 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["FMPadding_hls"] = FMPadding_hls +custom_op["AddStreams_hls"] = AddStreams_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py new file mode 100644 index 0000000000..1a40970b77 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py @@ -0,0 +1,281 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.addstreams import AddStreams +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class AddStreams_hls(AddStreams, HLSBackend): + """Class that corresponds to finn-hlslib AddStreams_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(AddStreams.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") + + return info_messages + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + # exact same thing for input1 + inp = context[node.input[1]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp0 = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + rtlsim_inp1 = npy_to_rtlsim_input( + "{}/input_1.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + npy_in = "%s/input_1.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in1_{} ("in1_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + hls_call = "AddStreams_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, {}, {}, {}, {}> (in0_{}, in1_{}, out_{}, 1);""".format( + hls_call, + self.get_nodeattr("PE"), + self.get_input_datatype().get_hls_datatype_str(), + self.get_input_datatype().get_hls_datatype_str(), + self.get_output_datatype().get_hls_datatype_str(), + self.get_number_output_values(), + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, + self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), + self.hls_sname(), + self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), + self.hls_sname(), + self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 4c926ad9b1..d45d1dc600 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -26,7 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +import warnings from onnx import helper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -69,3 +69,48 @@ def apply(self, model): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class SpecializeAddStreams(Transformation): + """Convert AddStreams layer to Addstreams_hls. There is no RTL variant of this node""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "AddStreams": + add_input0 = node.input[0] + add_input1 = node.input[1] + add_output = node.output[0] + add_inst = getCustomOp(node) + impl_style = add_inst.get_nodeattr("preferred_impl_style") + if impl_style == "rtl": + warn_str = """There is no RTL variant of %s. Node %s will automatically be + set to HLS variant.""" % ( + node.op_type, + node.name, + ) + warnings.warn(warn_str) + if impl_style == "" or impl_style == "rtl": + impl_style = "hls" + optype = node.op_type + "_" + impl_style + new_node = helper.make_node( + optype, + [add_input0, add_input1], + [add_output], + domain="finn.custom_op.fpgadataflow." + impl_style, + ) + # add all attributes + for attribute in node.attribute: + if attribute.name != "preferred_impl_style": + new_node.attribute.append(attribute) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(node) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) From 7182be4346ea1680d63c711d5fd47719217b7f7d Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 16 Nov 2023 17:46:01 +0000 Subject: [PATCH 003/291] [Tests] Update fmpadding and addstreams tests for new flow --- .../test_fpgadataflow_addstreams.py | 11 ++++++++--- .../fpgadataflow/test_fpgadataflow_fmpadding.py | 17 ++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 1ad2c26610..9b9c4a1e85 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,6 +44,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeAddStreams def make_addstreams_modelwrapper(ch, pe, idt): @@ -52,7 +53,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) addstreams_node = helper.make_node( - "AddStreams_Batch", + "AddStreams", ["inp1", "inp2"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -60,6 +61,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): NumChannels=ch, PE=pe, inputDataType=idt.name, + preferred_impl_style="hls", ) graph = helper.make_graph( nodes=[addstreams_node], @@ -103,6 +105,9 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): x2 = gen_finn_dt_tensor(idt, (1, ch)) model = make_addstreams_modelwrapper(ch, pe, idt) + model.save("addstreams_hw.onnx") + model = model.transform(SpecializeAddStreams()) + model.save("addstreams_hls.onnx") if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -130,7 +135,7 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("AddStreams_Batch")[0] + node = model.get_nodes_by_op_type("AddStreams_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index c871811c5e..4a4c46f3c3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,6 +47,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeFMPadding from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -53,7 +55,7 @@ target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt): +def make_single_fmpadding_modelwrapper(impl_style, idim, padding, num_ch, simd, idt): pad_h = padding[0] + padding[2] pad_w = padding[1] + padding[3] idim_h, idim_w = idim @@ -66,7 +68,7 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch]) FMPadding = helper.make_node( - optype, + "FMPadding", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -77,6 +79,7 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) inputDataType=str(idt.name), numInputVectors=1, SIMD=simd, + preferred_impl_style=impl_style, ) graph = helper.make_graph( @@ -125,9 +128,8 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): odim_h = idim_h + pad_h odim_w = idim_w + pad_w - optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style] - - model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt) + model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt) + model = model.transform(SpecializeFMPadding()) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) @@ -148,7 +150,8 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): assert (y_produced == y_expected).all() if mode == "rtlsim": - node = model.get_nodes_by_op_type(optype)[0] + op_type = "FMPadding_" + impl_style + node = model.get_nodes_by_op_type(op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From e742430dee1d0a53d27a7a9c095968a71484d0f5 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 11:47:21 +0000 Subject: [PATCH 004/291] [Transform] Generalize SpecializeLayers tranform --- .../fpgadataflow/specialize_layers.py | 168 +++++++++++------- .../test_fpgadataflow_addstreams.py | 4 +- .../test_fpgadataflow_fmpadding.py | 4 +- 3 files changed, 107 insertions(+), 69 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index d45d1dc600..4b2687faee 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -33,83 +33,121 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants +from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants -class SpecializeFMPadding(Transformation): - """Convert FMPadding layer to FMPadding_hls or FMPadding_rtl.""" +restricted_layers = [] +restricted_layers.append("MatrixVectorActivation") +restricted_layers.append("VectorVectorActivation") - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "FMPadding": - pad_input = node.input[0] - pad_output = node.output[0] - pad_inst = getCustomOp(node) - impl_style = pad_inst.get_nodeattr("preferred_impl_style") - if impl_style == "": - impl_style = "rtl" - optype = node.op_type + "_" + impl_style - new_node = helper.make_node( - optype, - [pad_input], - [pad_output], - domain="finn.custom_op.fpgadataflow." + impl_style, + +def _determine_impl_style(node): + optype = node.op_type + + # if rtl variant has specific restrictions + # use always the hls variant for now + if optype in restricted_layers: + return "hls" + + # check if there is an HLS or RTL variant or both + hls_variant = optype + "_hls" in hls_variants.keys() + rtl_variant = optype + "_rtl" in rtl_variants.keys() + + # check if user has specified a preferred_impl_style + inst = getCustomOp(node) + impl_style = inst.get_nodeattr("preferred_impl_style") + + # if impl_style not set, for "simple" layers always try + # to use rtl variant if available + if impl_style == "": + if rtl_variant: + return "rtl" + # but if no rtl variant, set impl_style to hls + elif hls_variant: + return "hls" + # if there is neither an rtl nor hls variant + # throw error + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype ) - # add all attributes - for attribute in node.attribute: - if attribute.name != "preferred_impl_style": - new_node.attribute.append(attribute) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(node) - graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) + ) + + # check if user setting can be fulfilled + # otherwise change impl_style + if impl_style == "hls": + if hls_variant: + return "hls" + elif rtl_variant: + warn_str = """There is no HLS variant of %s. Node %s will automatically be + set to RTL variant.""" % ( + node.op_type, + node.name, + ) + warnings.warn(warn_str) + return "rtl" + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype + ) + ) + elif impl_style == "rtl": + if rtl_variant: + return "rtl" + elif hls_variant: + warn_str = """There is no RTL variant of %s. Node %s will automatically be + set to HLS variant.""" % ( + node.op_type, + node.name, + ) + warnings.warn(warn_str) + return "hls" + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype + ) + ) + else: + raise Exception( + """Invalid value for attribute preferred_impl_style! Is currently set to: {} + has to be set to one of the following value ("hls", "rtl")""".format( + impl_style + ) + ) -class SpecializeAddStreams(Transformation): - """Convert AddStreams layer to Addstreams_hls. There is no RTL variant of this node""" +class SpecializeLayers(Transformation): + """Specialize all layers to either HLS or RTL variants""" def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False for node in graph.node: + # Skip nodes that are not hw layers + if not node.domain == "finn.custom_op.fpgadataflow": + continue node_ind += 1 - if node.op_type == "AddStreams": - add_input0 = node.input[0] - add_input1 = node.input[1] - add_output = node.output[0] - add_inst = getCustomOp(node) - impl_style = add_inst.get_nodeattr("preferred_impl_style") - if impl_style == "rtl": - warn_str = """There is no RTL variant of %s. Node %s will automatically be - set to HLS variant.""" % ( - node.op_type, - node.name, - ) - warnings.warn(warn_str) - if impl_style == "" or impl_style == "rtl": - impl_style = "hls" - optype = node.op_type + "_" + impl_style - new_node = helper.make_node( - optype, - [add_input0, add_input1], - [add_output], - domain="finn.custom_op.fpgadataflow." + impl_style, - ) - # add all attributes - for attribute in node.attribute: - if attribute.name != "preferred_impl_style": - new_node.attribute.append(attribute) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(node) - graph_modified = True + impl_style = _determine_impl_style(node) + optype = node.op_type + "_" + impl_style + + new_node = helper.make_node( + optype, + node.input, + node.output, + domain="finn.custom_op.fpgadataflow." + impl_style, + ) + # add all attributes + for attribute in node.attribute: + if attribute.name != "preferred_impl_style": + new_node.attribute.append(attribute) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(node) + graph_modified = True if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 9b9c4a1e85..ba3afe9c86 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -44,7 +44,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.specialize_layers import SpecializeAddStreams +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_addstreams_modelwrapper(ch, pe, idt): @@ -106,7 +106,7 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): model = make_addstreams_modelwrapper(ch, pe, idt) model.save("addstreams_hw.onnx") - model = model.transform(SpecializeAddStreams()) + model = model.transform(SpecializeLayers()) model.save("addstreams_hls.onnx") if exec_mode == "cppsim": diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 4a4c46f3c3..3717f92e5d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -47,7 +47,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.specialize_layers import SpecializeFMPadding +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -129,7 +129,7 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): odim_w = idim_w + pad_w model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt) - model = model.transform(SpecializeFMPadding()) + model = model.transform(SpecializeLayers()) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) From 14d306d74ce383ff38464fb410b3947611b30b2f Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 14:07:38 +0000 Subject: [PATCH 005/291] [CustomOp] Initial draft of channelwise op in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../custom_op/fpgadataflow/channelwise_op.py | 200 ++++++++ .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/channelwise_op_hls.py | 467 ++++++++++++++++++ .../test_fpgadataflow_channelwise_ops.py | 12 +- 5 files changed, 679 insertions(+), 4 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/channelwise_op.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index c433d83162..8f5ff0ac92 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -29,6 +29,7 @@ from finn.custom_op.fpgadataflow.addstreams import AddStreams from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch +from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch from finn.custom_op.fpgadataflow.checksum import CheckSum from finn.custom_op.fpgadataflow.concat import StreamingConcat @@ -103,3 +104,4 @@ custom_op["FMPadding"] = FMPadding custom_op["AddStreams"] = AddStreams +custom_op["ChannelwiseOp"] = ChannelwiseOp diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py new file mode 100644 index 0000000000..5d1d8febc1 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py @@ -0,0 +1,200 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# ONNX i/o tensor shape assumptions for channelwise ops: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... here can be any shape (representing groups of vectors) + + +def get_smallest_possible(vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. Prefers unsigned integers where possible.""" + vals = np.array(vals, dtype=np.float64) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.get_accumulator_dt_cands(): + dt = DataType[k] + + if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. + This will lead to errors if there are no constrains on the input + """ + ) + + if (0 <= vals).all(): + return DataType["UINT64"] + else: + return DataType["INT64"] + + +class ChannelwiseOp(HWCustomOp): + """Abstraction layer for HW implementation of ChannelwiseOp.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # channelwise "map" function to apply: + # one of cmp_le, cmp_ge, add, mul + "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}), + "PE": ("i", True, 0), + "NumChannels": ("i", True, 0), + # string defining memory resource type for parameters + "ram_style": ("s", False, "distributed", {"distributed", "block"}), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "paramDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_tmem(self): + """Calculates and returns TMEM, the depth of the memory used + to store the channelwise op parameters.""" + chn = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return chn // pe + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + # implement tensor with correct shape + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt = model.get_tensor_datatype(node.input[0]) + + exp_idt_name = self.get_nodeattr("inputDataType") + if exp_idt_name != idt.name: + func = self.get_nodeattr("Func") + assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer" + + self.set_nodeattr("inputDataType", idt.name) + # update the func in ['add','mul'] cases + + # get parameter ranges + param = model.get_initializer(node.input[1]) + param_min = min(param.flatten()) + param_max = max(param.flatten()) + + # set function and determine output data type + if func == "add": + out_min = idt.min() + param_min + out_max = idt.max() + param_max + odt = get_smallest_possible([out_min, out_max]) + elif func == "mul": + possible_limits = [] + possible_limits += [idt.min() * param_min] + possible_limits += [idt.min() * param_max] + possible_limits += [idt.max() * param_min] + possible_limits += [idt.max() * param_max] + odt = get_smallest_possible(possible_limits) + + self.set_nodeattr("outputDataType", odt.name) + + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + fold = ich // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + pass diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index f978a8616c..b5745c641d 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls +from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls custom_op = dict() @@ -35,3 +36,4 @@ # registered and plug in correctly into the infrastructure custom_op["FMPadding_hls"] = FMPadding_hls custom_op["AddStreams_hls"] = AddStreams_hls +custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py new file mode 100644 index 0000000000..d816b6f15a --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -0,0 +1,467 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from math import ceil +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + rtlsim_output_to_npy, +) + +# ONNX i/o tensor shape assumptions for channelwise ops: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... here can be any shape (representing groups of vectors) + + +class ChannelwiseOp_hls(ChannelwiseOp, HLSBackend): + """Class that corresponds to finn-hls Thresholding_Batch function. + It can implement a variety of channel-wise parametrized operations, + including Add, Mul and multi-thresholding. + """ + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(ChannelwiseOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("paramDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required Threshold_Batch attributes do not exist.""") + + return info_messages + + def bram_estimation(self): + """Calculates BRAM cost if resource set to BRAM""" + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + + if style == "block" and tmem > 1: + return int(ceil(A * P / 16)) * int(ceil(tmem / 1024)) + else: + return 0 + + def lut_estimation(self): + """Calculates LUT cost, taking memory resource type into account""" + # TODO add in/out FIFO contributions + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + # cost of comparators + comparator_cost = A * P + # cost of LUTRAM + if style == "distributed" and tmem > 1: + lutram_cost = P * A * int(ceil(tmem / 64)) + else: + lutram_cost = 0 + # total cost + return comparator_cost + lutram_cost + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + # fill in TSrcI + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_hls_compatible_parameter_tensor(self, orig_param_vector): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure chn % PE == 0 + * interleave rows between PEs + * reshape into (PE, TMEM) and return + """ + chn = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = chn // pe + assert chn % pe == 0, "Requirement NumChannels divisable by PE is violated." + assert ( + orig_param_vector.ndim == 1 + ), """Parameter vector dimension is {}. + Expected dimension: 1.""".format( + orig_param_vector.ndim + ) + + # if not self.get_input_datatype().signed(): + # # ensure all thresholds are nonnegative + # assert (orig_param_vector >= 0).all() + + # ensure all thresholds are integer + assert (orig_param_vector.astype(np.int32) == orig_param_vector).all() + ret = orig_param_vector + + assert ret.shape[0] == chn, "Cardinality of parameter vector is not as expected (chn)" + + # distribute rows between PEs + ret = ret.reshape(tmem, pe).transpose() + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + + return ret.reshape(1, pe, tmem) + + def generate_params(self, model, path): + code_gen_dir = path + # save thresholds in params.h + parameters = model.get_initializer(self.onnx_node.input[1]) + parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters) + pdt = DataType[self.get_nodeattr("paramDataType")] + + parameters_hls_code = numpy_to_hls_code(parameter_tensor, pdt, "parameters", False, True) + # get input data type + export_idt = self.get_input_datatype() + if self.get_input_datatype() == DataType["BIPOLAR"]: + export_idt = DataType["BINARY"] + idt_hls = export_idt.get_hls_datatype_str() + + # write parameters into params.h + f_params = open("{}/params.h".format(code_gen_dir), "w") + pdt_hls = pdt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() + # get desired function + func = self.get_nodeattr("Func") + if func == "cmp_le": + func_str = "comp::less_equal<%s, %s>" % (idt_hls, pdt_hls) + elif func == "cmp_ge": + func_str = "comp::greater_equal<%s, %s>" % (idt_hls, pdt_hls) + elif func == "add": + func_str = "comp::add<%s, %s, %s>" % (odt_hls, odt_hls, odt_hls) + elif func == "mul": + func_str = "comp::mul<%s, %s, %s>" % (odt_hls, odt_hls, odt_hls) + else: + raise Exception( + """Invalid value for attribute Func! Is currently set to: {} + has to be set to one of the following value + ("cmp_le", "cmp_ge", "add", "mul")""".format( + func + ) + ) + f_params.write( + "static ChannelWiseOperation<{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + idt_hls, + pdt_hls, + odt_hls, + func_str, + ) + ) + f_params.write(parameters_hls_code) + f_params.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for ChannelwiseOp_Batch") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), """Output shape is not as expected""" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] + + def defines(self, var): + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = numInputVectors[0] + self.code_gen_dict["$DEFINES$"] = [ + """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format( + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + numReps, + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + tmpl_args = self.get_template_param_values() + # TODO: why put some template parameters into defines and not others? + # should ImgDim be defined or just filled in here like we do now? + ishape = self.get_folded_input_shape() + if len(ishape) == 3: + spatial_dim = 1 + elif len(ishape) == 5: + spatial_dim = ishape[1] * ishape[2] + else: + raise Exception("""Unexpeted input shape""") + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> + (in0_{}, out_{}, threshs, numReps);""".format( + spatial_dim, + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL] + # partition for parallel access along PE and N_PARAMS_PER_CHANNEL + # dimensions (dims 1 and 3) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.parameters " "complete dim=1") + ) + # set resource type + ram_style = self.get_nodeattr("ram_style") + pe = self.get_nodeattr("PE") + ich = self.get_nodeattr("NumChannels") + # if PE less than NumChannels, assign cores according to ram_style; + # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs + if pe < ich: + if ram_style == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_LUTRAM") + ) + elif ram_style == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_BRAM") + ) + else: + raise Exception( + """Invalid value for attribute ram_style! Is currently set to: {} + has to be set to one of ("block", "distributed")""".format( + ram_style + ) + ) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index 186a6af42c..af9628c644 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,6 +46,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): @@ -56,7 +58,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): node_inp_list = ["inp", "const"] node = helper.make_node( - "ChannelwiseOp_Batch", + "ChannelwiseOp", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -68,6 +70,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): outputDataType=odt.name, paramDataType=pdt.name, numInputVectors=vecs, + preferred_impl_style="hls", ) graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp]) @@ -115,6 +118,7 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m odt = act model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -150,9 +154,9 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "ChannelwiseOp_Batch_0" in hls_synt_res_est + assert "ChannelwiseOp_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0] + node = model.get_nodes_by_op_type("ChannelwiseOp_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 3bf09c95410427b52b32c85ffc45837eec0b9a61 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 14:51:56 +0000 Subject: [PATCH 006/291] [CustomOp] Initial draft of DuplicateStreams in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../fpgadataflow/duplicatestreams.py | 168 ++++++++++ .../custom_op/fpgadataflow/hls/__init__.py | 4 +- .../fpgadataflow/hls/duplicatestreams_hls.py | 316 ++++++++++++++++++ .../test_fpgadataflow_duplicatestreams.py | 18 +- 5 files changed, 501 insertions(+), 7 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/duplicatestreams.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 8f5ff0ac92..305faaacf9 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -43,6 +43,7 @@ ConvolutionInputGenerator_rtl, ) from finn.custom_op.fpgadataflow.downsampler import DownSampler +from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise from finn.custom_op.fpgadataflow.fmpadding import FMPadding @@ -105,3 +106,4 @@ custom_op["FMPadding"] = FMPadding custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp +custom_op["DuplicateStreams"] = DuplicateStreams diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py new file mode 100644 index 0000000000..a4cf72df03 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py @@ -0,0 +1,168 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class DuplicateStreams(HWCustomOp): + """Abstraction layer for HW implementation of DuplicateStreams""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # how many duplicated output streams to create + "NumOutputStreams": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_num_output_streams(self): + return self.get_nodeattr("NumOutputStreams") + + def get_normal_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." + num_out = self.get_num_output_streams() + assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" + + oshape = self.get_normal_output_shape() + ret = super().make_const_shape_op(oshape) + ret.output[:] = self.onnx_node.output + return ret + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + for my_out in self.onnx_node.output: + model.set_tensor_datatype(my_out, odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + pass + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + n_outputs = self.get_num_output_streams() + sname = self.hls_sname() + intf_names["m_axis"] = [] + for i in range(n_outputs): + intf_names["m_axis"].append( + ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) + ) + return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index b5745c641d..450cf21f77 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -28,12 +28,14 @@ from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls custom_op = dict() # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure -custom_op["FMPadding_hls"] = FMPadding_hls custom_op["AddStreams_hls"] = AddStreams_hls +custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["FMPadding_hls"] = FMPadding_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py new file mode 100644 index 0000000000..4468ca152c --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py @@ -0,0 +1,316 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class DuplicateStreams_hls(DuplicateStreams, HLSBackend): + """Class that corresponds to finn-hlslib function of the same name.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(DuplicateStreams.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("NumOutputStreams") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") + + return info_messages + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def generate_params(self, model, path): + n_outputs = self.get_num_output_streams() + inp_streams = [] + commands = [] + o_stream_w = self.get_outstream_width() + i_stream_w = self.get_instream_width() + in_stream = "hls::stream > &in0" % (i_stream_w) + inp_streams.append(in_stream) + commands.append("ap_uint<%d> e = in0.read();" % i_stream_w) + iters = self.get_number_output_values() // self.get_num_output_streams() + for i in range(n_outputs): + out_stream = "hls::stream > &out%d" % (o_stream_w, i) + inp_streams.append(out_stream) + cmd = "out%d.write(e);" % i + commands.append(cmd) + + impl_hls_code = [] + impl_hls_code.append("void DuplicateStreamsCustom(") + impl_hls_code.append(",".join(inp_streams)) + impl_hls_code.append(") {") + impl_hls_code.append("for(unsigned int i = 0; i < %d; i++) {" % iters) + impl_hls_code.append("#pragma HLS PIPELINE II=1") + impl_hls_code.append("\n".join(commands)) + impl_hls_code.append("}") + impl_hls_code.append("}") + impl_hls_code = "\n".join(impl_hls_code) + + impl_filename = "{}/duplicate_impl.hpp".format(path) + f_impl = open(impl_filename, "w") + f_impl.write(impl_hls_code) + f_impl.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + n_outputs = self.get_num_output_streams() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_outputs(context, ["output%d.npy" % i for i in range(n_outputs)]) + for i in range(n_outputs): + assert ( + context[node.output[i]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {}, + } + for i in range(n_outputs): + rtlsim_dict["outputs"]["out%d" % i] = [] + self.rtlsim_multi_io(sim, rtlsim_dict) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_shape = self.get_folded_output_shape() + for i in range(n_outputs): + out_npy_path = "%s/output%d.npy" % (code_gen_dir, i) + rtlsim_output_to_npy( + rtlsim_dict["outputs"]["out%d" % i], + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + ) + # load and reshape output 0 + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[i]] = output + + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output0 shape doesn't match expected shape.""" + assert ( + context[node.output[1]].shape == exp_oshape + ), """Output1 shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "duplicate_impl.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + n_outputs = self.get_num_output_streams() + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + for i in range(n_outputs): + out_name = "out%d_%s" % (i, self.hls_sname()) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> %s ("%s");' + % (self.get_outstream_width(), out_name, out_name) + ) + + def docompute(self): + n_outputs = self.get_num_output_streams() + ostreams = [] + for i in range(n_outputs): + ostreams.append("out%d_%s" % (i, self.hls_sname())) + dc = "DuplicateStreamsCustom(in0_%s, %s);" % ( + self.hls_sname(), + ",".join(ostreams), + ) + self.code_gen_dict["$DOCOMPUTE$"] = [dc] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + n_outputs = self.get_num_output_streams() + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + outstrm_code = [] + + for i in range(n_outputs): + out_name = "out%d_%s" % (i, self.hls_sname()) + npy_out = "%s/output%d.npy" % (code_gen_dir, i) + outstrm_code.append( + 'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + out_name, + oshape_cpp_str, + npy_out, + ) + ) + + self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + n_outputs = self.get_num_output_streams() + inp_streams = [] + o_stream_w = self.get_outstream_width() + i_stream_w = self.get_instream_width() + in_stream = "hls::stream > &in0_%s" % (i_stream_w, self.hls_sname()) + inp_streams.append(in_stream) + for i in range(n_outputs): + out_stream = "hls::stream > &out%d_%s" % ( + o_stream_w, + i, + self.hls_sname(), + ) + inp_streams.append(out_stream) + + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}({})""".format( + self.onnx_node.name, + ",".join(inp_streams), + ) + ] + + def pragmas(self): + n_outputs = self.get_num_output_streams() + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + for i in range(n_outputs): + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out%d_%s" % (i, self.hls_sname()) + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 27bab93fb6..ac96380da3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,9 +47,10 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): +def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl, impl_style): shape = [1, idim, idim, ch] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) out_names = [] @@ -59,7 +61,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): out_vi.append(helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape)) dupstrm_node = helper.make_node( - "DuplicateStreams_Batch", + "DuplicateStreams", ["inp"], out_names, domain="finn.custom_op.fpgadataflow", @@ -69,6 +71,7 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): PE=pe, inputDataType=idt.name, numInputVectors=[1, idim, idim], + preferred_impl_style=impl_style, ) graph = helper.make_graph(nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi) @@ -99,9 +102,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("n_dupl", [2, 3]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl_style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): +def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, impl_style): if fold == -1: pe = 1 else: @@ -111,7 +116,8 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) - model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl) + model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl, impl_style) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -136,7 +142,7 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): assert (y == expected_y).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0] + node = model.get_nodes_by_op_type("DuplicateStreams_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 43fa39747030b0a54d8d4e81067b5eea0a110a6c Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 15:26:46 +0000 Subject: [PATCH 007/291] [CustomOp] Initial draft of GlobalAccPool in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../custom_op/fpgadataflow/globalaccpool.py | 155 +++++++++++ .../custom_op/fpgadataflow/hls/__init__.py | 4 +- .../fpgadataflow/hls/globalaccpool_hls.py | 255 ++++++++++++++++++ .../test_fpgadataflow_globalaccpool.py | 18 +- 5 files changed, 427 insertions(+), 7 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/globalaccpool.py create mode 100644 src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 305faaacf9..b939ea0c56 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -49,6 +49,7 @@ from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl +from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch @@ -107,3 +108,4 @@ custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp custom_op["DuplicateStreams"] = DuplicateStreams +custom_op["GlobalAccPool"] = GlobalAccPool diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool.py b/src/finn/custom_op/fpgadataflow/globalaccpool.py new file mode 100644 index 0000000000..c90385e9f0 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/globalaccpool.py @@ -0,0 +1,155 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class GlobalAccPool(HWCustomOp): + """Abstraction layer for HW implementation of GlobalAccPool""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + if len(vecs) == 1: + oshape = tuple(vecs + [ch]) + elif len(vecs) == 3: + oshape = tuple([vecs[0]] + [1, 1, ch]) + return oshape + + def get_folded_output_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + unfolded_shape = list(self.get_normal_output_shape()) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + oshape = tuple(unfolded_shape[:-1] + [folds, pe]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + # determine data type from image size and input type + idt = DataType[self.get_nodeattr("inputDataType")] + vecs = list(self.get_nodeattr("numInputVectors")) + npixels = vecs[-1] * vecs[-2] + if idt.signed(): + extreme_value = npixels * idt.min() + else: + extreme_value = npixels * idt.max() + return DataType.get_smallest_possible(extreme_value) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[1:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * idim * idim + Channels/PE + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + folds = int(ch / pe) + return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) + + def execute_node(self, context, graph): + pass diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 450cf21f77..075449d589 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -30,12 +30,14 @@ from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls +from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls custom_op = dict() # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["AddStreams_hls"] = AddStreams_hls -custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["FMPadding_hls"] = FMPadding_hls +custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py new file mode 100644 index 0000000000..4814c09e59 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -0,0 +1,255 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class GlobalAccPool_hls(GlobalAccPool, HLSBackend): + """Class that corresponds to finn-hlslib AccPool_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(GlobalAccPool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") + + # verify that input data is 2D + if len(self.get_nodeattr("numInputVectors")) != 3: + info_messages.append("""GlobalAccPool_Batch requires 2D data input.""") + raise Exception + + return info_messages + + def get_exp_cycles(self): + # Channels/PE * batch size * idim * idim + Channels/PE + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + folds = int(ch / pe) + return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format( + self.get_normal_input_shape()[1], + self.get_nodeattr("NumChannels"), + self.get_input_datatype().get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname(), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index 1b3d87c11f..a70db28c63 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,14 +45,15 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_accpool_modelwrapper(ch, pe, idim, idt): +def make_accpool_modelwrapper(ch, pe, idim, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, idim, idim, ch]) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, ch]) accpool_node = helper.make_node( - "GlobalAccPool_Batch", + "GlobalAccPool", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -60,6 +62,7 @@ def make_accpool_modelwrapper(ch, pe, idim, idt): PE=pe, inputDataType=idt.name, numInputVectors=[1, idim, idim], + preferred_impl_style=impl_style, ) graph = helper.make_graph(nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]) @@ -85,9 +88,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("imdim", [7]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl_style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): +def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style): if fold == -1: pe = 1 else: @@ -97,7 +102,8 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) - model = make_accpool_modelwrapper(ch, pe, imdim, idt) + model = make_accpool_modelwrapper(ch, pe, imdim, idt, impl_style) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -120,7 +126,7 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): assert (y == expected_y).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0] + node = model.get_nodes_by_op_type("GlobalAccPool_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From e1911670ab26e865677aaeae3c819e7da86f1107 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 15:27:56 +0000 Subject: [PATCH 008/291] [CustomOp] Delete duplication of exp_cycles func --- src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py | 4 ---- .../custom_op/fpgadataflow/hls/duplicatestreams_hls.py | 4 ---- src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py | 7 ------- 3 files changed, 15 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py index d816b6f15a..e7c263c084 100644 --- a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -117,10 +117,6 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" diff --git a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py index 4468ca152c..de0fadb26c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py @@ -69,10 +69,6 @@ def verify_node(self): return info_messages - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def generate_params(self, model, path): n_outputs = self.get_num_output_streams() inp_streams = [] diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py index 4814c09e59..93398b1dc9 100644 --- a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -73,13 +73,6 @@ def verify_node(self): return info_messages - def get_exp_cycles(self): - # Channels/PE * batch size * idim * idim + Channels/PE - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - folds = int(ch / pe) - return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node From 76da5ab3f990fcc36062dbfaf4862ee23b2aadb2 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 16:45:57 +0000 Subject: [PATCH 009/291] [CustomOp] Initial draft of LabelSelect in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/labelselect_hls.py | 262 ++++++++++++++++++ .../custom_op/fpgadataflow/labelselect.py | 146 ++++++++++ .../test_fpgadataflow_labelselect.py | 16 +- 5 files changed, 423 insertions(+), 5 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/labelselect.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index b939ea0c56..f51acf7136 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -52,6 +52,7 @@ from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA +from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation @@ -109,3 +110,4 @@ custom_op["ChannelwiseOp"] = ChannelwiseOp custom_op["DuplicateStreams"] = DuplicateStreams custom_op["GlobalAccPool"] = GlobalAccPool +custom_op["LabelSelect"] = LabelSelect diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 075449d589..66a5d7b53c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -31,6 +31,7 @@ from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls +from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls custom_op = dict() @@ -41,3 +42,4 @@ custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["FMPadding_hls"] = FMPadding_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls +custom_op["LabelSelect_hls"] = LabelSelect_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py new file mode 100644 index 0000000000..701d061987 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py @@ -0,0 +1,262 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.labelselect import LabelSelect +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class LabelSelect_hls(LabelSelect, HLSBackend): + """Class that corresponds to finn-hlslib LabelSelect_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(LabelSelect.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("Labels") + self.get_nodeattr("PE") + self.get_nodeattr("K") + self.get_nodeattr("inputDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") + + # verify that input data is 1D + if len(self.get_nodeattr("numInputVectors")) > 1: + info_messages.append("""LabelSelect_Batch requires 1D data input.""") + raise Exception + + return info_messages + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + # TopK ind output normally uses TensorProto.INT64, which + # can cause issues for the node-by-node simulation in FINN + # (as the custom DataType system always assumes float containers) + # so cast the output to int64 + ret = context[node.output[0]] + context[node.output[0]] = ret.astype(np.int64) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + + # Calling npy2apintstream with reverse_inner = false to have LE packing + # as required by HLS fxn LabelSelect_Batch + # Also notice that StreamingDataWidthConverter_Batch performs LE packing + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """LabelSelect_Batch<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format( + self.get_nodeattr("Labels"), + self.get_nodeattr("PE"), + self.get_nodeattr("K"), + self.get_input_datatype().get_hls_datatype_str(), + self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname(), + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream > &out_{})""".format( + self.onnx_node.name, + self.get_nodeattr("PE"), + self.get_input_datatype().bitwidth(), + self.hls_sname(), + self.get_output_datatype().bitwidth(), + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py new file mode 100644 index 0000000000..77b50e0fc6 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/labelselect.py @@ -0,0 +1,146 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class LabelSelect(HWCustomOp): + """Abstraction layer for HW implementation of LabelSelect""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + odt_name = self.get_nodeattr("outputDataType") + if odt_name == "": + # If not provided compute min size + labels = self.get_nodeattr("Labels") + odt = DataType.get_smallest_possible(labels - 1) + # ensure a datatype divisible by 8-bits in case this is the last node + bw = roundup_to_integer_multiple(odt.bitwidth(), 8) + new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw)) + odt = DataType[new_odt_name] + odt_name = odt.name + self.set_nodeattr("outputDataType", odt_name) + + def get_nodeattr_types(self): + my_attrs = { + "Labels": ("i", True, 0), + "PE": ("i", True, 0), + "K": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + "outputDataType": ("s", False, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + nlabels = self.get_nodeattr("Labels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [nlabels]) + return ishape + + def get_folded_input_shape(self, ind=0): + nlabels = self.get_nodeattr("Labels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert nlabels % pe == 0, "PE must divide Labels" + folds = int(nlabels / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k]) + return oshape + + def get_folded_output_shape(self, ind=0): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k, 1]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt = model.get_tensor_datatype(node.input[0]) + self.set_nodeattr("inputDataType", idt.name) + + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + ret = DataType[self.get_nodeattr("outputDataType")] + return ret + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + return self.get_output_datatype().bitwidth() + + def get_number_output_values(self): + return self.get_nodeattr("K") + + def execute_node(self, context, graph): + pass + + def get_exp_cycles(self): + nlabels = self.get_nodeattr("Labels") + pe = self.get_nodeattr("PE") + exp_cycles = nlabels / pe + return int(exp_cycles) diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index efd093b0b3..244d8c8a54 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -42,15 +43,16 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.test import soft_verify_topk -def make_labelselect_modelwrapper(labels, pe, k, idt): +def make_labelselect_modelwrapper(labels, pe, k, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, labels]) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, k]) labelselect_node = helper.make_node( - "LabelSelect_Batch", + "LabelSelect", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -59,6 +61,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt): PE=pe, K=k, inputDataType=idt.name, + preferred_impl_style=impl_style, ) graph = helper.make_graph( nodes=[labelselect_node], @@ -90,9 +93,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("k", [1, 5]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): +def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): np.random.seed(0) if fold == -1: pe = 1 @@ -106,7 +111,8 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, labels)) - model = make_labelselect_modelwrapper(labels, pe, k, idt) + model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) From a53dd8573c5bac57a84be0f26170ffd03afacf6e Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 18:02:53 +0000 Subject: [PATCH 010/291] [LabelSelect] Fix elem_type for make shape compatible for LabelSelect --- src/finn/custom_op/fpgadataflow/labelselect.py | 11 ++++++++++- tests/fpgadataflow/test_fpgadataflow_labelselect.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py index 77b50e0fc6..6b924034e4 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect.py +++ b/src/finn/custom_op/fpgadataflow/labelselect.py @@ -26,6 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple @@ -98,7 +99,15 @@ def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) assert ishape == exp_ishape, "Unexpected input shape." - return super().make_const_shape_op(oshape) + return helper.make_node( + "RandomNormal", + inputs=[], + outputs=[self.onnx_node.output[0]], + mean=0.0, + scale=1.0, + dtype=TensorProto.INT64, + shape=list(oshape), + ) def infer_node_datatype(self, model): node = self.onnx_node diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index 244d8c8a54..d9c3f54e63 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -49,7 +49,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, labels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, k]) + outp = helper.make_tensor_value_info("outp", TensorProto.INT64, [1, k]) labelselect_node = helper.make_node( "LabelSelect", From e42f4160cdb79c53260badb93560a09f8f827b62 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 29 Nov 2023 18:06:05 +0000 Subject: [PATCH 011/291] [Transform] Initial draft for conversion to hw layers and test case --- .../fpgadataflow/convert_to_hw_layers.py | 505 ++++++++++++++++++ .../test_convert_to_hw_layers_synthetic.py | 223 ++++++++ 2 files changed, 728 insertions(+) create mode 100644 src/finn/transformation/fpgadataflow/convert_to_hw_layers.py create mode 100644 tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py new file mode 100644 index 0000000000..e3813eb709 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -0,0 +1,505 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import numpy as np +import qonnx.core.data_layout as DataLayout +import warnings +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.transformation.base import Transformation +from qonnx.transformation.general import SortGraph +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.onnx import nchw_to_nhwc + + +class InferAddStreamsLayer(Transformation): + """Convert any Add into a AddStreams HW layer.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Add": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + # skip if any of inputs have initializers + # (this node is meant for adding two dynamic streams) + if in0_static or in1_static: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip if different data types on inputs + if idt0 != idt1: + continue + + idt = idt0 + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create and insert new AddStreams node + new_node = helper.make_node( + "AddStreams", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType=idt.name, + numInputVectors=in0_shape[:-1], + name="AddStreams_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferDuplicateStreamsLayer(Transformation): + """Insert a DuplicateStreams HW layer for any tensor with fanout == 2""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + successors = model.find_consumers(node.output[0]) + if successors is not None and len(successors) >= 2: + output_tensor = node.output[0] + n_outputs = len(successors) + + dt = model.get_tensor_datatype(output_tensor) + + # skip conversion for layers with float input + if not dt.is_integer(): + continue + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] + + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] + + # create node with no parallelization first + pe = 1 + + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, + ) + + graph.node.insert(node_ind, dup_node) + + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break + + graph_modified = True + + if graph_modified: + model = model.transform(SortGraph()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferChannelwiseLinearLayer(Transformation): + """Convert any channel-wise Add/Mul into a HW layer.""" + + def get_smallest_possible(self, vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. Prefers unsigned integers where possible.""" + vals = np.array(vals, dtype=np.float64) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.get_accumulator_dt_cands(): + dt = DataType[k] + + if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. + This will lead to errors if there are no constrains on the input + """ + ) + + if (0 <= vals).all(): + return DataType["UINT64"] + else: + return DataType["INT64"] + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Add" or node.op_type == "Mul": + # assuming input[0] is dynamic + ll_input = node.input[0] + ll_output = node.output[0] + ll_in_shape = model.get_tensor_shape(ll_input) + + # check if input 1 has an initializer + ll_const = node.input[1] + if ll_const is not None: + ll_cinit = model.get_initializer(ll_const) + if ll_cinit is None: + # input 1 is also dynamic + continue + else: + continue + + # get number of channels and channel index from input + ll_in_layout = model.get_tensor_layout(ll_input) + if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: + ch_index = -1 + ch = ll_in_shape[-1] + elif ll_in_layout == DataLayout.NCHW: + ch_index = 1 + ch = ll_in_shape[1] + else: + continue + + # check if the shape of initializer is compatible + ll_cinit_shape = list(ll_cinit.shape) + if np.prod(ll_cinit_shape) == 1: + warnings.warn("Broadcasting " + str(node.op_type) + "(" + node.name + ")") + ll_cinit = np.full((ch), ll_cinit.flatten()[0]) + elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: + # parameter shape not compatible with Channelwise + continue + + # check initializer contains integers as floats + if not (ll_cinit.astype(np.int32) == ll_cinit).all(): + continue + # all initializer conditions are met + + # check inputs + idt = model.get_tensor_datatype(ll_input) + if not idt.is_integer(): + # skip conversion for layers with float input + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + if ll_in_layout == DataLayout.NCHW: + ll_input = nchw_to_nhwc(ll_input, model, node_ind) + node_ind += 1 + ll_in_shape = model.get_tensor_shape(ll_input) + + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind + ll_output_layout = model.get_tensor_layout(ll_output) + if ll_output_layout == DataLayout.NCHW: + ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) + node_ind += 1 + + # get parameter data type + param_min = min(ll_cinit.flatten()) + param_max = max(ll_cinit.flatten()) + pdt = self.get_smallest_possible([param_min, param_max]) + + # set function and determine output data type + if node.op_type == "Add": + func = "add" + out_min = idt.min() + param_min + out_max = idt.max() + param_max + odt = self.get_smallest_possible([out_min, out_max]) + elif node.op_type == "Mul": + func = "mul" + possible_limits = [] + possible_limits += [idt.min() * param_min] + possible_limits += [idt.min() * param_max] + possible_limits += [idt.max() * param_min] + possible_limits += [idt.max() * param_max] + odt = self.get_smallest_possible(possible_limits) + + model.set_initializer(ll_const, ll_cinit.reshape(ch)) + model.set_tensor_datatype(ll_output, odt) + + # create node with no parallelization first + pe = 1 + assert ch % pe == 0, "Requirement IFC divisable by PE is violated." + # create and insert node + new_node = helper.make_node( + "ChannelwiseOp", + [ll_input, ll_const], + [ll_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + Func=func, + NumChannels=ch, + PE=pe, + inputDataType=idt.name, + paramDataType=pdt.name, + outputDataType=odt.name, + numInputVectors=list(ll_in_shape[:-1]), + name="ChannelwiseOp_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferLabelSelectLayer(Transformation): + """Convert any TopK into a LabelSelect HW layer.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "TopK": + fc_input = node.input[0] + k_input = node.input[1] + val_output = node.output[0] + idx_output = node.output[1] + fc_in_shape = model.get_tensor_shape(fc_input) + + idt = model.get_tensor_datatype(fc_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # skip conversion for if value output is connected (not supported) + if model.find_consumer(val_output) is not None: + continue + + num_labels = int(fc_in_shape[-1]) + num_inp_vecs = list(fc_in_shape[:-1]) + # create node with no parallelization first + pe = 1 + + k = model.get_initializer(k_input)[0] + + # create and insert new LabelSelect node + new_node = helper.make_node( + "LabelSelect", + [fc_input], + [idx_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + Labels=num_labels, + PE=pe, + K=k, + inputDataType=idt.name, + numInputVectors=num_inp_vecs, + name="LabelSelect_" + node.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferGlobalAccPoolLayer(Transformation): + """Convert any GlobalAveragePool into a GlobalAccPool HW layer and a scalar Mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "GlobalAveragePool": + in0 = node.input[0] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + + idt = model.get_tensor_datatype(in0) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + num_ch = int(in0_shape[-1]) + vecs = in0_shape[:-1] + # create node with no parallelization first + pe = 1 + + # create an additional tensor of the same shape and layout as result + out_shape = model.get_tensor_shape(result) + pool_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(pool_out) + pool_out = pool_out.name + model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) + + new_pool = helper.make_node( + "GlobalAccPool", + [in0], + [pool_out], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=idt.name, + numInputVectors=vecs, + name="GlobalAccPool_" + node.name, + ) + + mul_value = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] + ) + model.graph.value_info.append(mul_value) + model.set_initializer( + mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) + ) + new_mul = helper.make_node( + "Mul", + [pool_out, mul_value.name], + [result], + ) + graph.node.insert(insert_point, new_pool) + graph.node.insert(insert_point + 1, new_mul) + node_ind += 1 + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py new file mode 100644 index 0000000000..be8bce7fc3 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -0,0 +1,223 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +import os +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.fold_constants import FoldConstants +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + SortGraph, +) +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.insert_topk import InsertTopK +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.streamline.absorb import ( + AbsorbConsecutiveTransposes, + AbsorbScalarMulAddIntoTopK, +) +from finn.transformation.streamline.collapse_repeated import ( + CollapseRepeatedAdd, + CollapseRepeatedMul, +) +from finn.transformation.streamline.reorder import ( + MoveAddPastMul, + MoveScalarLinearPastInvariants, +) +from finn.util.test import soft_verify_topk + +export_onnx_path = "test_output_synthetic.onnx" + +# construct a synthetic graph to test: +# topk insertion, topk conversion to hls, add conversion to hls +# graph should just be a sum + + +def make_model(ch, ifmdim): + shape = [1, ch, ifmdim, ifmdim] + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) + inp1_add0_ct = helper.make_tensor_value_info("inp1_add0_ct", TensorProto.FLOAT, [1]) + inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape) + inp1_add_ct = helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1]) + inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape) + inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1]) + inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape) + inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1]) + inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape) + inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1]) + eltwise_add = helper.make_tensor_value_info("eltwise_add", TensorProto.FLOAT, shape) + pool = helper.make_tensor_value_info("pool", TensorProto.FLOAT, [1, ch, 1, 1]) + reshape_ct = helper.make_tensor_value_info("reshape_ct", TensorProto.INT64, [2]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) + + add0_node = helper.make_node("Add", [inp.name, inp1_add0_ct.name], ["out_add0"]) + add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name]) + add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name]) + mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]) + mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]) + eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name]) + globalavgpool_node = helper.make_node("GlobalAveragePool", [eltwise_add.name], [pool.name]) + reshape_node = helper.make_node("Reshape", [pool.name, reshape_ct.name], [outp.name]) + + graph = helper.make_graph( + nodes=[ + add0_node, + add1_node, + add2_node, + mul1_node, + mul2_node, + eltwise_add_node, + globalavgpool_node, + reshape_node, + ], + name="graph", + inputs=[inp], + outputs=[outp], + ) + + model = qonnx_make_model(graph, producer_name="add-model") + model = ModelWrapper(model) + + # set initializers for scalar add/mul nodes + model.set_initializer(add0_node.input[1], np.array([0.0], dtype=np.float32)) + model.set_initializer(add1_node.input[1], np.array([7.0], dtype=np.float32)) + model.set_initializer(add2_node.input[1], np.array([8.0], dtype=np.float32)) + model.set_initializer(mul1_node.input[1], np.array([2.0], dtype=np.float32)) + model.set_initializer(mul2_node.input[1], np.array([2.0], dtype=np.float32)) + model.set_initializer(reshape_node.input[1], np.array([1, -1], dtype=np.int64)) + + return model + + +# data types +@pytest.mark.parametrize("idt", [DataType["UINT2"]]) +# channels +@pytest.mark.parametrize("ch", [16]) +# ifmdim +@pytest.mark.parametrize("ifmdim", [5]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): + model = make_model(ch, ifmdim) + model.save(export_onnx_path) + model = ModelWrapper(export_onnx_path, fix_float64=True) + model = model.transform(InferShapes()) + model = model.transform(FoldConstants()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model = model.transform(InferDataLayouts()) + # generate test vectors of correct shape + if ifmdim == -1: + input_tensor_shape = (1, ch) + else: + input_tensor_shape = (1, ch, ifmdim, ifmdim) + + x = gen_finn_dt_tensor(idt, input_tensor_shape) + + # generate expected value from streamlined net + input_dict = {model.graph.input[0].name: x} + + output_dict = oxe.execute_onnx(model, input_dict, True) + produced_sum = output_dict[model.graph.output[0].name] + chw_mul = model.get_initializer(model.graph.node[-1].input[1]) + chw_mul = 1 + expected_sum = chw_mul * np.sum(2 * (2 * x + 15.0), axis=(2, 3)) / (ifmdim * ifmdim) + assert (produced_sum.flatten() == expected_sum.flatten()).all() + + model = model.transform(InferDataLayouts()) + + # convert to hls + model.set_tensor_datatype(model.graph.input[0].name, idt) + # extra streamlining + model = model.transform(MoveScalarLinearPastInvariants()) + model = model.transform(MoveAddPastMul()) + model = model.transform(CollapseRepeatedMul()) + model = model.transform(CollapseRepeatedAdd()) + # insert top-k node, which should absorb linear ops before it + + model = model.transform(InferShapes()) + model = model.transform(InferDataLayouts()) + model = model.transform(InferDataTypes()) + + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferAddStreamsLayer()) + model = model.transform(to_hw.InferGlobalAccPoolLayer()) + model = model.transform(MoveScalarLinearPastInvariants()) + model = model.transform(InsertTopK()) + model = model.transform(AbsorbScalarMulAddIntoTopK()) + model = model.transform(InferDataTypes()) + model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(AbsorbConsecutiveTransposes()) + model = model.transform(InferDataTypes()) + # model = model.transform(to_hw.InferLabelSelectLayer()) + model = model.transform(to_hw.InferDuplicateStreamsLayer()) + + model = model.transform(SortGraph()) + + # check topology status + + finn_nodes = model.get_finn_nodes() + assert len(finn_nodes) == 9 + add_nodes = model.get_nodes_by_op_type("AddStreams") + assert len(add_nodes) == 1 + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool") + assert len(pool_nodes) == 1 + label_nodes = model.get_nodes_by_op_type("LabelSelect") + assert len(label_nodes) == 1 + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp") + assert len(channelwise_nodes) == 5 + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams") + assert len(dup_nodes) == 1 + + model = model.transform(SpecializeLayers()) + + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + + output_dict = oxe.execute_onnx(model, input_dict, True) + produced_topk_hls = output_dict[model.graph.output[0].name] + topk_input = output_dict[model.graph.node[-1].input[0]] + assert soft_verify_topk(topk_input, produced_topk_hls, 5) + + os.remove(export_onnx_path) From c854276ca282563ac4cc7216b878ad8a77e4f7ad Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 12:17:37 +0000 Subject: [PATCH 012/291] [AddStreams] Add execution for hw abstraction layer --- src/finn/custom_op/fpgadataflow/addstreams.py | 11 ++++++++++- .../test_fpgadataflow_addstreams.py | 19 +++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py index 0f1336c6e1..ac61786ac1 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams.py +++ b/src/finn/custom_op/fpgadataflow/addstreams.py @@ -141,7 +141,16 @@ def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) def execute_node(self, context, graph): - pass + # simulate behavior using Python + node = self.onnx_node + inp0_values = context[node.input[0]] + inp1_values = context[node.input[1]] + oshape = context[node.output[0]].shape + ishape0 = inp0_values.shape + ishape1 = inp1_values.shape + assert ishape0 == ishape1, "Shapes of inputs should be the same for Addstreams" + result = inp0_values + inp1_values + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index ba3afe9c86..530d94e13b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -105,9 +105,18 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): x2 = gen_finn_dt_tensor(idt, (1, ch)) model = make_addstreams_modelwrapper(ch, pe, idt) - model.save("addstreams_hw.onnx") + + # prepare input data + input_dict = prepare_inputs(x1, x2) + oshape = model.get_tensor_shape("outp") + y = x1 + x2 + y_expected = y.reshape(oshape) + + # test verification flow before specializing layer + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all(), "Execution of hw layer failed" + model = model.transform(SpecializeLayers()) - model.save("addstreams_hls.onnx") if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -122,12 +131,6 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data - input_dict = prepare_inputs(x1, x2) - - oshape = model.get_tensor_shape("outp") - y = x1 + x2 - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) From 445360d1b0d3d8030bc6fdf492b8c7de47ddada6 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 12:25:59 +0000 Subject: [PATCH 013/291] [ChannelwiseOp] Add execution for hw abstraction layer --- .../custom_op/fpgadataflow/channelwise_op.py | 36 +++++++++++++++++- .../test_fpgadataflow_channelwise_ops.py | 37 ++++++++++++------- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py index 5d1d8febc1..9bf4ebdf62 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op.py +++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py @@ -27,8 +27,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import onnxruntime as rt import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp @@ -197,4 +200,35 @@ def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) def execute_node(self, context, graph): - pass + # create a standard onnx node to help calculate the result + # depending on Func node attribute either a Mul or an Add node + node = self.onnx_node + func = self.get_nodeattr("Func") + inp_values = context[node.input[0]] + param_values = context[node.input[1]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + pshape = param_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + param = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, pshape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_func = helper.make_node( + func.capitalize(), + inputs=node.input, + outputs=[node.output[0]], + ) + graph_func = helper.make_graph( + nodes=[node_func], + name="single-add-exec", + inputs=[inp, param], + outputs=[outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_func = qonnx_make_model(graph_func, **onnx_kwargs) + idict = {node.input[0]: inp_values, node.input[1]: param_values} + sess = rt.InferenceSession(model_func.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index af9628c644..d5fa7c779f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -112,12 +112,33 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m # generate input and param data x = gen_finn_dt_tensor(idt, tuple(vecs + [ich])) - # C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32) C = gen_finn_dt_tensor(pdt, (ich)) odt = act + # create model model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs) + + # package input data as dictionary + input_dict = {"inp": x} + + oshape = model.get_tensor_shape("outp") + + C_reshaped = np.broadcast_to(C.flatten(), x.shape) + if func == "add": + y = x + C_reshaped + elif func == "mul": + y = x * C_reshaped + + y_expected = y.reshape(oshape) + + # verify hw abstraction layer + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "HW layer execution failed" + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": @@ -133,24 +154,12 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m else: raise Exception("Unknown exec_mode") - # package input data as dictionary - input_dict = {"inp": x} - - oshape = model.get_tensor_shape("outp") - - C_reshaped = np.broadcast_to(C.flatten(), x.shape) - if func == "add": - y = x + C_reshaped - elif func == "mul": - y = x * C_reshaped - - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) From 67f922670f9693d3851b913357ab2ce83b448e4c Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 13:55:49 +0000 Subject: [PATCH 014/291] [DuplicateStreams] Add execution for hw abstraction layer --- .../custom_op/fpgadataflow/duplicatestreams.py | 11 ++++++++++- .../test_fpgadataflow_duplicatestreams.py | 14 +++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py index a4cf72df03..8943ffc9e3 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams.py +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py @@ -144,7 +144,16 @@ def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) def execute_node(self, context, graph): - pass + # passing input to both outputs to make + # abstraction layer executable + node = self.onnx_node + inp = context[node.input[0]] + exp_shape = self.get_normal_input_shape() + + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + for outp in node.output: + context[outp] = output def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index ac96380da3..62b9265466 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -117,6 +117,17 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl, impl_style) + + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + + # check behavior of hw abstraction layer + output_dict = oxe.execute_onnx(model, input_dict) + expected_y = x + for i in range(n_dupl): + y = output_dict["outp%d" % i] + assert (y == expected_y).all(), "HW layer execution failed" + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": @@ -132,11 +143,8 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) output_dict = oxe.execute_onnx(model, input_dict) - expected_y = x for i in range(n_dupl): y = output_dict["outp%d" % i] assert (y == expected_y).all(), exec_mode + " failed" From 95b1ec5c21cb3289e99e165e49e2a3676c936604 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 14:01:26 +0000 Subject: [PATCH 015/291] [GlobalAccPool] Add execution for hw abstraction layer --- src/finn/custom_op/fpgadataflow/globalaccpool.py | 7 ++++++- .../fpgadataflow/test_fpgadataflow_globalaccpool.py | 12 +++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool.py b/src/finn/custom_op/fpgadataflow/globalaccpool.py index c90385e9f0..4008cdc7c9 100644 --- a/src/finn/custom_op/fpgadataflow/globalaccpool.py +++ b/src/finn/custom_op/fpgadataflow/globalaccpool.py @@ -152,4 +152,9 @@ def get_exp_cycles(self): return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) def execute_node(self, context, graph): - pass + # simulate behavior with Python functionality + node = self.onnx_node + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + result = np.apply_over_axes(np.sum, inp_values, [1, 2]) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index a70db28c63..9c2802aade 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -102,7 +102,16 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style) # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + expected_y = np.sum(x, axis=(1, 2)).flatten() + model = make_accpool_modelwrapper(ch, pe, imdim, idt, impl_style) + + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert (y == expected_y).all(), "HW layer verification failed" + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": @@ -118,10 +127,7 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style) else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) y = oxe.execute_onnx(model, input_dict)["outp"] - expected_y = np.sum(x, axis=(1, 2)).flatten() assert (y == expected_y).all(), exec_mode + " failed" From 05d3dbfab4c1e48b44543d39eb46d05dcccd8814 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 14:05:01 +0000 Subject: [PATCH 016/291] [LabelSelect] Add execution for hw abstraction layer --- .../custom_op/fpgadataflow/labelselect.py | 37 +++++++++++++++++-- .../test_fpgadataflow_labelselect.py | 8 +++- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py index 6b924034e4..f4b098cff7 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect.py +++ b/src/finn/custom_op/fpgadataflow/labelselect.py @@ -25,10 +25,11 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - +import numpy as np +import onnxruntime as rt from onnx import TensorProto, helper from qonnx.core.datatype import DataType -from qonnx.util.basic import roundup_to_integer_multiple +from qonnx.util.basic import qonnx_make_model, roundup_to_integer_multiple from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp @@ -146,7 +147,37 @@ def get_number_output_values(self): return self.get_nodeattr("K") def execute_node(self, context, graph): - pass + # create a standard add node to help calculate the result + node = self.onnx_node + k = self.get_nodeattr("K") + + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + k_inp = helper.make_tensor_value_info("k_inp", TensorProto.INT64, [1]) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.INT64, oshape) + val_outp = helper.make_tensor_value_info("val_outp", TensorProto.FLOAT, oshape) + node_topk = helper.make_node( + "TopK", + inputs=[node.input[0], "k_inp"], + outputs=["val_outp", node.output[0]], + ) + graph_topk = helper.make_graph( + nodes=[node_topk], + name="single-add-exec", + inputs=[inp, k_inp], + outputs=[val_outp, outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_topk = qonnx_make_model(graph_topk, **onnx_kwargs) + idict = {node.input[0]: inp_values, "k_inp": [k]} + sess = rt.InferenceSession(model_topk.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result[1], dtype=np.float32).reshape(oshape) def get_exp_cycles(self): nlabels = self.get_nodeattr("Labels") diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index d9c3f54e63..98ded66ca7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -110,8 +110,14 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): # generate input data x = gen_finn_dt_tensor(idt, (1, labels)) + input_dict = prepare_inputs(x, idt) model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style) + + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert soft_verify_topk(x, y, k), "HW layer execution failed" + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": @@ -127,8 +133,6 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) y = oxe.execute_onnx(model, input_dict)["outp"] assert soft_verify_topk(x, y, k), exec_mode + " failed" From 55449a8f5356757ffbb901980eebebea042efa59 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 14:10:09 +0000 Subject: [PATCH 017/291] [Tests] Expand conversion to hw layers tests by functional verification --- .../test_convert_to_hw_layers_synthetic.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py index be8bce7fc3..02a53485ad 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -189,7 +189,6 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(AbsorbConsecutiveTransposes()) model = model.transform(InferDataTypes()) - # model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(to_hw.InferDuplicateStreamsLayer()) model = model.transform(SortGraph()) @@ -209,14 +208,37 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): dup_nodes = model.get_nodes_by_op_type("DuplicateStreams") assert len(dup_nodes) == 1 + output_hw = oxe.execute_onnx(model, input_dict, True) + model = model.transform(SpecializeLayers()) + # check topology status + + finn_nodes = model.get_finn_nodes() + assert len(finn_nodes) == 9 + add_nodes = model.get_nodes_by_op_type("AddStreams_hls") + assert len(add_nodes) == 1 + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_hls") + assert len(pool_nodes) == 1 + label_nodes = model.get_nodes_by_op_type("LabelSelect_hls") + assert len(label_nodes) == 1 + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_hls") + assert len(channelwise_nodes) == 5 + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_hls") + assert len(dup_nodes) == 1 + model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) output_dict = oxe.execute_onnx(model, input_dict, True) - produced_topk_hls = output_dict[model.graph.output[0].name] + + # verify execution + outp_name = model.graph.output[0].name + # comparison before and after layer specialization + assert (output_dict[outp_name] == output_hw[outp_name]).all() + # comparison with golden output + produced_topk_hls = output_dict[outp_name] topk_input = output_dict[model.graph.node[-1].input[0]] assert soft_verify_topk(topk_input, produced_topk_hls, 5) From 9eb113f5e7f4f8246fe54ddf4a9844e4b0571c3e Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 14:46:38 +0000 Subject: [PATCH 018/291] [FMPadding] Add execution for hw abstraction layer --- src/finn/custom_op/fpgadataflow/fmpadding.py | 10 +++++++++- tests/fpgadataflow/test_fpgadataflow_fmpadding.py | 14 ++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py index 0324984c3f..5767028ea7 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding.py @@ -161,4 +161,12 @@ def get_number_output_values(self): return np.prod(folded_oshape[:-1]) def execute_node(self, context, graph): - pass + # simulate behavior with Python functionality + node = self.onnx_node + pad = self.get_nodeattr("Padding") + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + result = np.pad( + inp_values, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant" + ) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 3717f92e5d..12c84e7221 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -128,8 +128,17 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): odim_h = idim_h + pad_h odim_w = idim_w + pad_w + y_expected = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant") + expected_oshape = (1, odim_h, odim_w, num_ch) + model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt) + + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert y_produced.shape == expected_oshape + assert (y_produced == y_expected).all(), "HW layer execution failed" + model = model.transform(SpecializeLayers()) + model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) @@ -142,11 +151,8 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] - expected_oshape = (1, odim_h, odim_w, num_ch) - assert y_produced.shape == expected_oshape - - y_expected = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant") + assert y_produced.shape == expected_oshape assert (y_produced == y_expected).all() if mode == "rtlsim": From 0b78eef851839965294228cd975d496f43868b27 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 15:27:06 +0000 Subject: [PATCH 019/291] [Tests] Add hw conversion test for channelwise layer --- .../test_convert_to_hw_channelwise_layer.py | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py diff --git a/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py new file mode 100644 index 0000000000..4b063f8505 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py @@ -0,0 +1,143 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_data_layouts import InferDataLayouts +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +def make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) + p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) + + model = qonnx_make_model( + helper.make_graph( + name="test", + inputs=[inp], + outputs=[outp], + value_info=[p0], + nodes=[helper.make_node(onnx_op_name, ["inp", "p0"], ["outp"])], + ) + ) + + model = ModelWrapper(model) + model.set_initializer("p0", gen_finn_dt_tensor(pdt, pshape)) + model.set_tensor_datatype("inp", idt) + model.transform(InferDataLayouts(), make_deepcopy=False) + model.transform(InferShapes(), make_deepcopy=False) + return model + + +# parameter datatype +@pytest.mark.parametrize("pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]]) +# function +@pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"]) +# vector parameter or scalar parameter (broadcast) +@pytest.mark.parametrize("scalar_param", [True, False]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_convert_to_hw_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, exec_mode): + ifm_ch = 16 + ifm_dim = 5 + ishape = (1, ifm_ch, ifm_dim, ifm_dim) + if scalar_param: + pshape = (1,) + else: + pshape = (1, ifm_ch, 1, 1) + + np.random.seed(0) + model = make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) + + # Since the aren't Data types with a bit width of a non power of 2, + # there are cases where the input won't use it full range. + if idt == DataType["INT32"]: + x = gen_finn_dt_tensor(DataType["INT16"], (1, ifm_ch, ifm_dim, ifm_dim)) + elif idt == DataType["UINT32"]: + x = gen_finn_dt_tensor(DataType["UINT16"], (1, ifm_ch, ifm_dim, ifm_dim)) + else: + x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) + + input_dict = prepare_inputs(x) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(GiveUniqueNodeNames()) + + ctx_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + y_produced = ctx_produced["outp"] + + assert (y_produced == y_expected).all() + assert model.graph.node[1].op_type == "ChannelwiseOp" + + model = model.transform(SpecializeLayers()) + + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode") + + ctx_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + y_produced = ctx_produced["outp"] + + assert (y_produced == y_expected).all() + assert model.graph.node[1].op_type == "ChannelwiseOp_hls" From 30fc1aef45e0c13c7a8e600f38d12ecd84e3bd28 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 30 Nov 2023 17:46:27 +0000 Subject: [PATCH 020/291] [CustomOp] Initial draft of streamingmaxpool in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 + .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/streamingmaxpool_hls.py | 300 ++++++++++++++++++ .../fpgadataflow/streamingmaxpool.py | 229 +++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 56 ++++ .../test_fpgadataflow_streamingmaxpool.py | 18 +- 6 files changed, 602 insertions(+), 5 deletions(-) create mode 100755 src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py create mode 100755 src/finn/custom_op/fpgadataflow/streamingmaxpool.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index f51acf7136..0a92b99fd4 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -67,6 +67,7 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO +from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker @@ -111,3 +112,4 @@ custom_op["DuplicateStreams"] = DuplicateStreams custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect +custom_op["StreamingMaxPool"] = StreamingMaxPool diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 66a5d7b53c..96d0e6f6a9 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -32,6 +32,7 @@ from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls +from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls custom_op = dict() @@ -43,3 +44,4 @@ custom_op["FMPadding_hls"] = FMPadding_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls custom_op["LabelSelect_hls"] = LabelSelect_hls +custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py new file mode 100755 index 0000000000..eb3284a343 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -0,0 +1,300 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingMaxPool_hls(StreamingMaxPool, HLSBackend): + """Class that corresponds to finn-hlslib StreamingMaxPool_batch function.""" + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingMaxPool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""StreamingMaxPool_Batch needs 1 data input""") + + return info_messages + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + + def defines(self, var): + numReps = 1 + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + ceil_mode = self.get_nodeattr("CeilMode") + output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + + if self.is_1d(): + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim {}\n #define PoolDim {}\n + #define NumChannels {}\n #define PE {}\n #define OutputSize {} + \n #define numReps {}""".format( + ifm_dim[1], + k[1], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + output_size, + numReps, + ) + ] + else: + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim {}\n #define PoolDim {}\n + #define NumChannels {}\n #define numReps {}""".format( + ifm_dim[1], + k[1], + self.get_nodeattr("NumChannels"), + numReps, + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + dtype = self.get_input_datatype() + if dtype.bitwidth() == 1: + if self.is_1d(): + raise Exception("Binary 1d MaxPool not implemented on HLS backend") + else: + op = "StreamingMaxPool" + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s);" + % (op, self.hls_sname(), self.hls_sname()) + ] + else: + dtype = self.get_input_datatype() + dtype_hls = dtype.get_hls_datatype_str() + minval_str = str(int(dtype.min())) + if self.is_1d(): + op = "StreamingMaxPool_Precision_1d" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """%s(in0_%s, out_%s);""" + % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) + ] + else: + op = "StreamingMaxPool_Precision" + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s);" + % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py new file mode 100755 index 0000000000..0f85a22993 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py @@ -0,0 +1,229 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import onnxruntime as rt +import warnings +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.util.basic import qonnx_make_model + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# TODO: consider splitting this into separate implementations for 1D and 2D +# similar to what we do for ConvolutionInputGenerator + + +class StreamingMaxPool(HWCustomOp): + """Abstraction layer for HW implementation of StreamingMaxPool""" + + def get_nodeattr_types(self): + my_attrs = { + "ImgDim": ("ints", True, []), # [H, W] = [Y, X] + "PoolDim": ("ints", True, []), # [H, W] = [Y, X] + "NumChannels": ("i", True, 0), + # parallelism control - only supported for 1D maxpool + "PE": ("i", False, 0), + # round up (instead of down) output size - only supported for 1D maxpool + "CeilMode": ("i", False, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("dataType")] + + def get_1d_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # assume the dummy ('1') dimension is the Y-dimension, i.e. + # images and kernels (and their attributes) of dimension + # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] + ifm_dim = self.get_nodeattr("ImgDim") + k = self.get_nodeattr("PoolDim") + ifm_ch = self.get_nodeattr("NumChannels") + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + k = k[::-1] + return (ifm_dim, k, ifm_ch) + + def is_1d(self): + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + return (ifm_dim[0] == 1) and (k[0] == 1) + + def get_normal_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + if self.is_1d(): + folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) + else: + folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + k_h, k_w = tuple(self.get_nodeattr("PoolDim")) + ifm_ch = self.get_nodeattr("NumChannels") + ceil_mode = self.get_nodeattr("CeilMode") + if not self.is_1d(): + assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" + assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" + ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) + oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + # even though there is no folding in the current hlslib op, + # insert a time multiplexing axis to remain compatible with the + # shapes produced by the rest of the dataflow pipeline + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + ret = list(self.get_normal_output_shape()) + if self.is_1d(): + ret[-1] = nf + ret.append(pe) + else: + ret.insert(-1, 1) + return tuple(ret) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_exp_cycles(self): + # derived from StreamingMaxPool_Batch loop nest + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + + warnings.warn( + """Estimated latency for layer {} can be lower than + actual latency!""".format( + self.onnx_node.name + ) + ) + if self.is_1d(): + _, _, _, nf, _ = self.get_folded_output_shape() + ceil_mode = self.get_nodeattr("CeilMode") + ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + exp_cycles = ofm_dim * nf * (k[1] + 1) + return int(exp_cycles) + else: + # TODO: adjust inaccurate formula + return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) + + def get_instream_width(self, ind=0): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + ifm_ch = self.get_nodeattr("NumChannels") + if self.is_1d(): + in_width = int(dt_bits * pe) + else: + in_width = int(dt_bits * ifm_ch) + return in_width + + def get_outstream_width(self, ind=0): + """For streaming maxpool out stream width is the same as in stream width""" + return self.get_instream_width() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for StreamingMaxPool." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def execute_node(self, context, graph): + # create a standard add node to help calculate the result + node = self.onnx_node + kernel_shape = self.get_nodeattr("PoolDim") + inp_values = context[node.input[0]] + dummy_out = context[node.output[0]] + # convert i/o NHWC -> NCHW + inp_values = np.transpose(inp_values, (0, 3, 1, 2)) + dummy_out = np.transpose(dummy_out, (0, 3, 1, 2)) + # execute as regular MaxPool + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, inp_values.shape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, dummy_out.shape) + node_mp = helper.make_node( + "MaxPool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=kernel_shape, + strides=kernel_shape, + ) + graph_mp = helper.make_graph( + nodes=[node_mp], + name="single-mp-exec", + inputs=[inp], + outputs=[outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_mp = qonnx_make_model(graph_mp, **onnx_kwargs) + idict = {node.input[0]: inp_values} + sess = rt.InferenceSession(model_mp.SerializeToString()) + result = sess.run(None, idict) + result = np.asarray(result, dtype=np.float32).reshape(dummy_out.shape) + # convert output NCHW -> NHWC + result = np.transpose(result, (0, 2, 3, 1)) + context[node.output[0]] = result diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index e3813eb709..7a896f5c96 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -32,6 +32,7 @@ import warnings from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from qonnx.transformation.general import SortGraph from qonnx.transformation.infer_datatypes import InferDataTypes @@ -39,6 +40,61 @@ from qonnx.util.onnx import nchw_to_nhwc +class InferStreamingMaxPool(Transformation): + """Convert MaxPoolNHWC layers to StreamingMaxPool HW layers.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "MaxPoolNHWC": + mp_input = node.input[0] + mp_output = node.output[0] + mp_in_shape = model.get_tensor_shape(mp_input) + # mp_out_shape = model.get_tensor_shape(mp_output) + dt = model.get_tensor_datatype(mp_input) + mp_inst = getCustomOp(node) + k_h, k_w = mp_inst.get_nodeattr("kernel_shape") + ifm_ch = mp_in_shape[-1] + ifm_dim_h = mp_in_shape[1] + ifm_dim_w = mp_in_shape[2] + pe = 1 + ceil_mode = mp_inst.get_nodeattr("ceil_mode") + is_1d = (ifm_dim_h == 1 and k_h == 1) or (ifm_dim_w == 1 and k_w == 1) + is_divisable = (ifm_dim_h % k_h == 0) or (ifm_dim_w % k_w == 0) + is_bipolar = dt == DataType["BIPOLAR"] + pass_1d = is_1d and (not is_bipolar) + pass_2d = (not is_1d) and is_divisable + if pass_1d or pass_2d: + # create equivalent StreamingMaxPool_Batch node + new_node = helper.make_node( + "StreamingMaxPool", + [mp_input], + [mp_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PoolDim=(k_h, k_w), + NumChannels=ifm_ch, + ImgDim=(ifm_dim_h, ifm_dim_w), + dataType=dt.name, + PE=pe, + CeilMode=ceil_mode, + name="StreamingMaxPool_" + node.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(node) + graph_modified = True + else: + warnings.warn(node.name + ": could not convert to HW") + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferAddStreamsLayer(Transformation): """Convert any Add into a AddStreams HW layer.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 67a40d96f3..643187cf66 100644 --- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,12 +41,13 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferStreamingMaxPool +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferStreamingMaxPool from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode): @@ -138,10 +140,16 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil model = golden.transform(InferStreamingMaxPool()) model = model.transform(InferShapes()) - assert model.graph.node[0].op_type == "StreamingMaxPool_Batch" + assert model.graph.node[0].op_type == "StreamingMaxPool" + + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) # Ensure PE value is set - streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] getCustomOp(streamingmaxpool_node).set_nodeattr("PE", pe) if exec_mode == "cppsim": @@ -162,7 +170,7 @@ def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] # inst = getCustomOp(node) # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 30c027e49c73d35e9230f83c8f8f91c2c4ca068a Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 1 Dec 2023 12:09:33 +0000 Subject: [PATCH 021/291] [StreamingMaxPool] Fix execution hw layer for 1d case --- src/finn/custom_op/fpgadataflow/streamingmaxpool.py | 5 +++++ tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py index 0f85a22993..1c2622c3d2 100755 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py @@ -199,6 +199,11 @@ def execute_node(self, context, graph): # convert i/o NHWC -> NCHW inp_values = np.transpose(inp_values, (0, 3, 1, 2)) dummy_out = np.transpose(dummy_out, (0, 3, 1, 2)) + # handle 1d case + ishape = inp_values.shape + if ishape[2] == 1 or ishape[3] == 1: + inp_values = inp_values.reshape(ishape[0], ishape[1], ishape[2] * ishape[3]) + kernel_shape = [kernel_shape[0] * kernel_shape[1]] # execute as regular MaxPool inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, inp_values.shape) outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, dummy_out.shape) diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 643187cf66..0df7181a60 100644 --- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -94,7 +94,7 @@ def prepare_inputs(input_tensor): # input dimension @pytest.mark.parametrize("ifm_dim", [4, 10]) # input channels -@pytest.mark.parametrize("ifm_ch", [1, 3]) # 1,3 +@pytest.mark.parametrize("ifm_ch", [1, 3]) # pe @pytest.mark.parametrize("pe", [1, 3]) # ceil mode From b95c142bb9321c4986f2238a4e9ce1d4a8882b46 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 1 Dec 2023 14:21:48 +0000 Subject: [PATCH 022/291] [StreamingMaxPool] Fix bug in execution and restrict conversion to hw layer --- src/finn/custom_op/fpgadataflow/streamingmaxpool.py | 2 ++ .../transformation/fpgadataflow/convert_to_hw_layers.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py index 1c2622c3d2..59a8f092d0 100755 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py @@ -194,6 +194,7 @@ def execute_node(self, context, graph): # create a standard add node to help calculate the result node = self.onnx_node kernel_shape = self.get_nodeattr("PoolDim") + ceil_mode = self.get_nodeattr("CeilMode") inp_values = context[node.input[0]] dummy_out = context[node.output[0]] # convert i/o NHWC -> NCHW @@ -213,6 +214,7 @@ def execute_node(self, context, graph): outputs=[node.output[0]], kernel_shape=kernel_shape, strides=kernel_shape, + ceil_mode=ceil_mode, ) graph_mp = helper.make_graph( nodes=[node_mp], diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 7a896f5c96..289b4edd5c 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -53,10 +53,15 @@ def apply(self, model): mp_input = node.input[0] mp_output = node.output[0] mp_in_shape = model.get_tensor_shape(mp_input) - # mp_out_shape = model.get_tensor_shape(mp_output) dt = model.get_tensor_datatype(mp_input) mp_inst = getCustomOp(node) k_h, k_w = mp_inst.get_nodeattr("kernel_shape") + s_h, s_w = mp_inst.get_nodeattr("strides") + if k_h != s_h or k_w != s_w: + warn_str = """Stride is not equal to kernel. Node cannot be converted to + StreamingMaxPool layer.""" + warnings.warn(warn_str) + continue ifm_ch = mp_in_shape[-1] ifm_dim_h = mp_in_shape[1] ifm_dim_w = mp_in_shape[2] From 0b0beebb49cce5dbc1ac0607387717a12ebe77cc Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 1 Dec 2023 15:43:55 +0000 Subject: [PATCH 023/291] [Upsampler] Initial draft of upsampler in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 4 +- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/upsampler_hls.py | 254 +++++++++++++ src/finn/custom_op/fpgadataflow/upsampler.py | 246 +++--------- .../custom_op/fpgadataflow/upsampler_batch.py | 351 ++++++++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 98 +++++ .../test_fpgadataflow_upsampler.py | 43 +-- 7 files changed, 765 insertions(+), 233 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/upsampler_batch.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 0a92b99fd4..9e90616ff8 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -71,7 +71,8 @@ from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker -from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch +from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour +from finn.custom_op.fpgadataflow.upsampler_batch import UpsampleNearestNeighbour_Batch from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation custom_op = dict() @@ -113,3 +114,4 @@ custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect custom_op["StreamingMaxPool"] = StreamingMaxPool +custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 96d0e6f6a9..f800054bfd 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls +from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls custom_op = dict() @@ -45,3 +46,4 @@ custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls custom_op["LabelSelect_hls"] = LabelSelect_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py new file mode 100644 index 0000000000..89a474a5d3 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -0,0 +1,254 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class UpsampleNearestNeighbour_hls(UpsampleNearestNeighbour, HLSBackend): + """ + Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. + Upsampling is done with the Nearest Neighbour algorithm. + The layer expects square feature maps for the in and output. + """ + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(UpsampleNearestNeighbour.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + pass + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("IFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + odim = self.get_nodeattr("OFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + is_2d = self.get_nodeattr("DimMode") == 0 + batch = self.get_nodeattr("numInputVectors") + if is_2d: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" + % (self.hls_sname(), self.hls_sname()) + ] + else: + assert batch == 1, "1D upsampler currently needs numReps=1" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" + % (self.hls_sname(), self.hls_sname()) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index 9c0db1f3df..b0264ffa8a 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,19 +27,17 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import onnxruntime as rt import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class UpsampleNearestNeighbour_Batch(HLSCustomOp): - """ - Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. - Upsampling is done with the Nearest Neighbour algorithm. - The layer expects square feature maps for the in and output. - """ +class UpsampleNearestNeighbour(HWCustomOp): + """Abstraction layer for HW implementation of UpsampleNearestNeighbour.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -150,202 +148,44 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] - - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] - - idim = self.get_nodeattr("IFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] - - odim = self.get_nodeattr("OFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] - - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - is_2d = self.get_nodeattr("DimMode") == 0 - batch = self.get_nodeattr("numInputVectors") - if is_2d: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" - % (self.hls_sname(), self.hls_sname()) - ] - else: - assert batch == 1, "1D upsampler currently needs numReps=1" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" - % (self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # create a standard add node to help calculate the result node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + inp_values = context[node.input[0]] + ishape = inp_values.shape + odim = self.get_nodeattr("OFMDim") + idim = self.get_nodeattr("IFMDim") + if ishape[1] == ishape[2]: + scales_val = [1, int(round(odim / idim)), int(round(odim / idim)), 1] + elif ishape[1] > 1 and ishape[2] == 1: + scales_val = [1, int(round(odim / idim)), 1, 1] else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) + warnings.warn( + """HW abstraction layer for Upsample cannot be executed. + Upsampling only supported for 1D H, or 2D square scaling""" ) + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + scales = helper.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_resize = helper.make_node( + "Resize", + inputs=[node.input[0], "", "scales"], + outputs=[node.output[0]], + mode="nearest", + ) + graph_resize = helper.make_graph( + nodes=[node_resize], + name="single-resize-exec", + inputs=[inp, scales], + outputs=[outp], + ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + opset_version = 13 + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_resize = qonnx_make_model(graph_resize, **onnx_kwargs) + idict = {node.input[0]: inp_values, "scales": scales_val} + sess = rt.InferenceSession(model_resize.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/upsampler_batch.py b/src/finn/custom_op/fpgadataflow/upsampler_batch.py new file mode 100644 index 0000000000..9c0db1f3df --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/upsampler_batch.py @@ -0,0 +1,351 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class UpsampleNearestNeighbour_Batch(HLSCustomOp): + """ + Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. + Upsampling is done with the Nearest Neighbour algorithm. + The layer expects square feature maps for the in and output. + """ + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Size of the output feature map + "OFMDim": ("i", True, 0), + # Size of the input feature map + "IFMDim": ("i", True, 0), + # Amount of channels of the input feature map + "NumChannels": ("i", True, 0), + # FINN input datatype + "inputDataType": ("s", True, ""), + # Batch size + "numInputVectors": ("i", False, 1), + # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim + "DimMode": ("i", False, 0), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_exp_cycles(self): + OFMDim = self.get_nodeattr("OFMDim") + batch_size = self.get_nodeattr("numInputVectors") + is_2d = self.get_nodeattr("DimMode") == 0 + reps = 1 + if is_2d: + OFMDim = OFMDim * OFMDim + reps = batch_size + exp_cycles = OFMDim * reps + return int(exp_cycles) + + def get_normal_input_shape(self, ind=0): + IFMDim = self.get_nodeattr("IFMDim") + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") + is_2d = self.get_nodeattr("DimMode") == 0 + if is_2d: + ishape = (batch, IFMDim, IFMDim, num_ch) + else: + ishape = (batch, IFMDim, 1, num_ch) + return ishape + + def get_normal_output_shape(self, ind=0): + OFMDim = self.get_nodeattr("OFMDim") + num_ch = self.get_nodeattr("NumChannels") + batch = self.get_nodeattr("numInputVectors") + is_2d = self.get_nodeattr("DimMode") == 0 + if is_2d: + oshape = (batch, OFMDim, OFMDim, num_ch) + else: + oshape = (batch, OFMDim, 1, num_ch) + return oshape + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + return tuple(normal_ishape) + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + return tuple(normal_oshape) + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for UpsampleNearestNeighbour_Batch." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + ifm_ch = self.get_nodeattr("NumChannels") + return ibits * ifm_ch + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + ifm_ch = self.get_nodeattr("NumChannels") + return obits * ifm_ch + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("IFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + odim = self.get_nodeattr("OFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + is_2d = self.get_nodeattr("DimMode") == 0 + batch = self.get_nodeattr("numInputVectors") + if is_2d: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" + % (self.hls_sname(), self.hls_sname()) + ] + else: + assert batch == 1, "1D upsampler currently needs numReps=1" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" + % (self.hls_sname(), self.hls_sname()) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 289b4edd5c..1c2dfeca96 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -37,9 +37,107 @@ from qonnx.transformation.general import SortGraph from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import get_by_name from qonnx.util.onnx import nchw_to_nhwc +class InferUpsample(Transformation): + """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "Upsample" or n.op_type == "Resize": + # Extract mode and scales and input shape + mode = get_by_name(n.attribute, "mode").s.decode("ascii") + if n.op_type == "Upsample": + scales = model.get_initializer(n.input[1]) + else: + scales = model.get_initializer(n.input[2]) + in_shape = model.get_tensor_shape(n.input[0]) + + dt = model.get_tensor_datatype(n.input[0]) + if not dt.is_integer(): + warnings.warn( + "%s: Input not int. Can't infer UpsampleNearestNeighbour." % n.name + ) + continue + + if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC: + warnings.warn( + "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." % n.name + ) + continue + + # Check that the parameters are okay + assert mode == "nearest", ( + "%s: Upsampling is only supported for the mode nearest." % n.name + ) + assert len(in_shape) == 4, "Upsampling is only supported for 4D inputs." + assert scales.shape == (4,), ( + "%s: Upsampling is only supported for 4D scales." % n.name + ) + assert (scales >= 1).all(), ( + n.name + ": Upsampling is only supported for scales " + "which are larger or equal 1 in all dimensions." + ) + + # Assumes nhwc layout for scales and input + is_scale_square_2d = scales[1] == scales[2] + is_scale_1d = scales[1] > 1 and scales[2] == 1 + assert is_scale_square_2d or is_scale_1d, ( + "%s: Upsampling only supported for 1D H, or 2D square scaling" % n.name + ) + assert scales[0] == scales[3] == 1, ( + n.name + ": Upsampling is only supported for scales with " + "the first and last dimensions being 1 in NHWC." + ) + spatial_scale = scales[1] + assert spatial_scale == int(spatial_scale), ( + "%s: Upsampling is only supported for integer scales." % n.name + ) + is_shape_square_2d = in_shape[1] == in_shape[2] + is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1 + + assert is_shape_square_2d or is_shape_1d, ( + "%s: Upsampling is only supported for 1D H or 2D square inputs." % n.name + ) + + # Extract information for HW node + IFMDim = in_shape[1] + OFMDim = int(round(in_shape[1] * spatial_scale)) + NumChannels = in_shape[-1] + numInputVectors = in_shape[0] + inputDataType = dt.name + dim_mode = 0 if is_shape_square_2d else 1 + + # Insert the HWCustomOp node + Upsample_HW_node = helper.make_node( + "UpsampleNearestNeighbour", + [n.input[0]], + [n.output[0]], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + OFMDim=OFMDim, + IFMDim=IFMDim, + NumChannels=NumChannels, + inputDataType=inputDataType, + numInputVectors=numInputVectors, + DimMode=dim_mode, + name="UpsampleNearestNeighbour_" + n.name, + ) + + # Remove the old node + graph.node.insert(node_ind, Upsample_HW_node) + # remove old nodes + graph.node.remove(n) + graph_modified = True + return (model, graph_modified) + + class InferStreamingMaxPool(Transformation): """Convert MaxPoolNHWC layers to StreamingMaxPool HW layers.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index 70d81c7d31..b0da767eaa 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -48,12 +48,13 @@ import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferUpsample from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import make_build_dir @@ -84,29 +85,6 @@ def apply(self, model): _to_chan_first_args = (0, 3, 1, 2) -class TransposeUpsampleIO(Transformation): - """ - Converts the inputs outputs for all Upsample and Resize nodes - from NCHW to NHWC. - """ - - def apply(self, model): - graph = model.graph - for n in graph.node: - if n.op_type == "Upsample" or n.op_type == "Resize": - # Set input shape - inp = n.input[0] - NCHW_shape = model.get_tensor_shape(inp) - NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args] - model.set_tensor_shape(inp, NHWC_shape) - # Set output shape - out = n.output[0] - NCHW_shape = model.get_tensor_shape(out) - NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args] - model.set_tensor_shape(out, NHWC_shape) - return model, False - - class PyTorchTestModel(nn.Module): def __init__(self, upscale_factor=2): super(PyTorchTestModel, self).__init__() @@ -173,7 +151,6 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Prep model for execution model = ModelWrapper(export_path) - # model = model.transform(TransposeUpsampleIO()) model = model.transform(MakeInputChannelsLast()) model = model.transform(InferDataLayouts()) model = model.transform(absorb.AbsorbTransposeIntoResize()) @@ -186,8 +163,18 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Check that all nodes are UpsampleNearestNeighbour_Batch nodes for n in model.get_finn_nodes(): - node_check = n.op_type == "UpsampleNearestNeighbour_Batch" - assert node_check, "All nodes should be UpsampleNearestNeighbour_Batch nodes." + node_check = n.op_type == "UpsampleNearestNeighbour" + assert node_check, "All nodes should be UpsampleNearestNeighbour nodes." + + test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) + input_dict = {model.graph.input[0].name: test_in_transposed} + + # Run sim + output_dict = oxe.execute_onnx(model, input_dict, True) + test_result = output_dict[model.graph.output[0].name] + output_matches = np.isclose(golden_result, test_result, atol=atol).all() + + model = model.transform(SpecializeLayers()) # Prep sim if exec_mode == "cppsim": @@ -204,8 +191,6 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d raise Exception("Unknown exec_mode") # Run sim - test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) - input_dict = {model.graph.input[0].name: test_in_transposed} output_dict = oxe.execute_onnx(model, input_dict, True) test_result = output_dict[model.graph.output[0].name] output_matches = np.isclose(golden_result, test_result, atol=atol).all() From 807bad1f52a0672863e9182a6396186d5f2243e1 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 1 Dec 2023 16:37:16 +0000 Subject: [PATCH 024/291] [Eltwise] Initial draft of upsampler in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 4 +- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/streamingeltwise_hls.py | 336 ++++++++++++++++++ .../fpgadataflow/streamingeltwise.py | 216 +++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 95 +++++ .../fpgadataflow/test_fpgadataflow_eltwise.py | 16 +- 6 files changed, 664 insertions(+), 5 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/streamingeltwise.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 9e90616ff8..6fe7993643 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -45,7 +45,6 @@ from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch -from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl @@ -66,6 +65,7 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_rtl import ( StreamingDataWidthConverter_rtl, ) +from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch @@ -104,7 +104,6 @@ custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum -custom_op["StreamingEltwise"] = StreamingEltwise custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["FMPadding"] = FMPadding @@ -113,5 +112,6 @@ custom_op["DuplicateStreams"] = DuplicateStreams custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect +custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index f800054bfd..df58decf81 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -32,6 +32,7 @@ from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls +from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls @@ -45,5 +46,6 @@ custom_op["FMPadding_hls"] = FMPadding_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls custom_op["LabelSelect_hls"] = LabelSelect_hls +custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py new file mode 100644 index 0000000000..2aec40f988 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py @@ -0,0 +1,336 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingEltwise_hls(StreamingEltwise, HLSBackend): + """Class that corresponds to finn-hlslib StreamingEltwise function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingEltwise.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType0") + self.get_nodeattr("inputDataType1") + self.get_nodeattr("eltwiseOp") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required StreamingEltwise attributes do not exist.""") + + return info_messages + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" + export_idt0 = self.get_input_datatype(0) + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + # exact same thing for input1 + inp = context[node.input[1]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" + export_idt1 = self.get_input_datatype(1) + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits0 = self.get_instream_width(0) + nbits1 = self.get_instream_width(1) + rtlsim_inp0 = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0 + ) + rtlsim_inp1 = npy_to_rtlsim_input( + "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1 + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = [ + '#include "eltwise.hpp"', + '#include "interpret.hpp"', + ] + + self.code_gen_dict["$GLOBALS$"].extend( + [ + "template", + "struct absdiff {", + "TO operator()(TI1 const &a, TI2 const &b) const {", + "#pragma HLS inline", + "return a>b? a-b : b-a;", + "}", + "};", + "template", + "struct sub {", + "TO operator()(TI1 const &a, TI2 const &b) const {", + "#pragma HLS inline", + "return a-b;", + "}", + "};", + "template", + "struct add {", + "TO operator()(TI1 const &a, TI2 const &b) const {", + "#pragma HLS inline", + "return a+b;", + "}", + "};", + ] + ) + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + elem_bits_0 = idt0.bitwidth() + elem_bits_1 = idt1.bitwidth() + packed_bits_0 = self.get_instream_width(0) + packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0 + packed_bits_1 = self.get_instream_width(1) + packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1 + elem_hls_type_0 = idt0.get_hls_datatype_str() + elem_hls_type_1 = idt1.get_hls_datatype_str() + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type_0, + elem_hls_type_0, + elem_bits_0, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + npy_in = "%s/input_1.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);' + % ( + packed_hls_type_1, + elem_hls_type_1, + elem_bits_1, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(0), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in1_{} ("in1_{}");'.format( + self.get_instream_width(1), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + elem_hls_type_0 = idt0.get_hls_datatype_str() + elem_hls_type_1 = idt1.get_hls_datatype_str() + out_hls_type = odt.get_hls_datatype_str() + slice_in0 = "Slice<%s>" % elem_hls_type_0 + slice_in1 = "Slice<%s>" % elem_hls_type_1 + slice_out = "Slice<%s>" % out_hls_type + eltwise_op_str = self.get_eltwise_op_lambda() + "%sEltwiseFunction<%s, %s, %s>()" % ( + op, + elem_hls_type_0, + elem_hls_type_1, + out_hls_type, + ) + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{}<{}, {}, {}, {}, {}, {}>(in0_{}, in1_{}, out_{}, {});""".format( + "StreamingEltwise", + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + int(np.prod(self.get_folded_output_shape()[:-2])), + slice_in0, + slice_in1, + slice_out, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + eltwise_op_str, + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, + self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(), + self.hls_sname(), + self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(), + self.hls_sname(), + self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/streamingeltwise.py b/src/finn/custom_op/fpgadataflow/streamingeltwise.py new file mode 100644 index 0000000000..4681c144f7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingeltwise.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class StreamingEltwise(HWCustomOp): + """Abstraction layer for HW implementation of StreamingEltwise""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(ind))] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # simulate behavior using Python + node = self.onnx_node + inp0_values = context[node.input[0]] + inp1_values = context[node.input[1]] + eltwiseOp = self.get_nodeattr("eltwiseOp") + oshape = context[node.output[0]].shape + ishape0 = inp0_values.shape + ishape1 = inp1_values.shape + assert ishape0 == ishape1, "Shapes of inputs should be the same for Streamingeltwise" + # subtraction + result = inp0_values - inp1_values + if eltwiseOp == "Sub": + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + elif eltwiseOp == "AbsDiff": + context[node.output[0]] = np.abs(np.asarray(result, dtype=np.float32)).reshape(oshape) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (node.name, eltwiseOp)) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 1c2dfeca96..11bd3406d5 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -662,3 +662,98 @@ def apply(self, model): model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) + + +class InferStreamingEltwise(Transformation): + """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer + with SubEltwise or AbsDiffEltwise op.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Sub": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + # skip if any of inputs have initializers + # (this node is meant for two dynamic streams) + if in0_static or in1_static: + continue + + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) + + # skip conversion for layers with float input + if not (idt0.is_integer() and idt1.is_integer()): + continue + + eltwiseOp = "Sub" + nodes_to_remove = [node] + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) + + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) + + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) + + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) + + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind + + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create and insert new Eltwise node + new_node = helper.make_node( + "StreamingEltwise", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType0=idt0.name, + inputDataType1=idt1.name, + eltwiseOp=eltwiseOp, + numInputVectors=in0_shape[:-1], + name="StreamingEltwise_" + node.name, + ) + graph.node.insert(insert_point, new_node) + # remove old nodes + for nd in nodes_to_remove: + graph.node.remove(nd) + graph_modified = True + + return (model, graph_modified) diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py index 6028a9b9f0..fbfcc8e28b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_eltwise.py +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,7 +39,7 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -47,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def build_model(shp, dt0, dt1, do_abs): @@ -105,9 +107,17 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): in1 = gen_finn_dt_tensor(dt1, shp) idict = {"in0": in0, "in1": in1} y_expected = execute_onnx(model, idict)["out0"] - model = model.transform(to_hls.InferStreamingEltwise()) + model = model.transform(to_hw.InferStreamingEltwise()) assert len(model.graph.node) == 1 assert model.graph.node[0].op_type == "StreamingEltwise" + + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), exec_mode + " failed" + + model = model.transform(SpecializeLayers()) + + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise_hls" getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -124,7 +134,7 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("StreamingEltwise")[0] + node = model.get_nodes_by_op_type("StreamingEltwise_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 1b82867313f3ecd16bff383c2f322c18cfda64bb Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 1 Dec 2023 16:38:58 +0000 Subject: [PATCH 025/291] [CustomOp] Cleanup after replacement of custom ops --- src/finn/custom_op/fpgadataflow/__init__.py | 18 - .../fpgadataflow/addstreams_batch.py | 392 ----------- .../fpgadataflow/channelwise_op_batch.py | 613 ------------------ .../fpgadataflow/duplicatestreams_batch.py | 429 ------------ src/finn/custom_op/fpgadataflow/eltwise.py | 484 -------------- .../custom_op/fpgadataflow/fmpadding_batch.py | 407 ------------ .../custom_op/fpgadataflow/fmpadding_rtl.py | 414 ------------ .../fpgadataflow/globalaccpool_batch.py | 352 ---------- .../fpgadataflow/labelselect_batch.py | 369 ----------- .../fpgadataflow/streamingmaxpool_batch.py | 441 ------------- 10 files changed, 3919 deletions(-) delete mode 100644 src/finn/custom_op/fpgadataflow/addstreams_batch.py delete mode 100644 src/finn/custom_op/fpgadataflow/channelwise_op_batch.py delete mode 100644 src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py delete mode 100644 src/finn/custom_op/fpgadataflow/eltwise.py delete mode 100644 src/finn/custom_op/fpgadataflow/fmpadding_batch.py delete mode 100644 src/finn/custom_op/fpgadataflow/fmpadding_rtl.py delete mode 100644 src/finn/custom_op/fpgadataflow/globalaccpool_batch.py delete mode 100644 src/finn/custom_op/fpgadataflow/labelselect_batch.py delete mode 100755 src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 6fe7993643..249716ce29 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -28,9 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from finn.custom_op.fpgadataflow.addstreams import AddStreams -from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp -from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch from finn.custom_op.fpgadataflow.checksum import CheckSum from finn.custom_op.fpgadataflow.concat import StreamingConcat from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( @@ -44,15 +42,10 @@ ) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams -from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch from finn.custom_op.fpgadataflow.fmpadding import FMPadding -from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch -from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool -from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch from finn.custom_op.fpgadataflow.iodma import IODMA from finn.custom_op.fpgadataflow.labelselect import LabelSelect -from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch @@ -68,11 +61,9 @@ from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool -from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour -from finn.custom_op.fpgadataflow.upsampler_batch import UpsampleNearestNeighbour_Batch from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation custom_op = dict() @@ -80,7 +71,6 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["DownSampler"] = DownSampler -custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D @@ -89,22 +79,14 @@ custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO"] = StreamingFIFO -custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch custom_op["Pool_Batch"] = Pool_Batch -custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["Thresholding_Batch"] = Thresholding_Batch -custom_op["AddStreams_Batch"] = AddStreams_Batch -custom_op["LabelSelect_Batch"] = LabelSelect_Batch -custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch custom_op["VectorVectorActivation"] = VectorVectorActivation -custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition -custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum -custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["FMPadding"] = FMPadding custom_op["AddStreams"] = AddStreams diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/addstreams_batch.py deleted file mode 100644 index 51de1590ec..0000000000 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class AddStreams_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib AddStreams_Batch function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = super().get_nodeattr_types() - my_attrs.update( - { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - "inFIFODepths": ("ints", False, [2, 2]), - } - ) - return my_attrs - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich]) - return ishape - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - assert ich % pe == 0, "PE must divide NumChannels" - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich // pe, pe]) - return ishape - - def get_normal_output_shape(self, ind=0): - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input1 shape." - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) - assert ishape == exp_ishape, "Unexpected input2 shape." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # enforce output data type (calculated based on idt) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - # we need to set output datatype to the next larger int or uint - # enhancement: consider specifying w/ explicit outputDataType attribute - # to allow overflow and use the same idt if user wants - idt = DataType[self.get_nodeattr("inputDataType")] - if idt.signed(): - return DataType.get_smallest_possible(2 * idt.min()) - else: - return DataType.get_smallest_possible(2 * idt.max()) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - # exact same thing for input1 - inp = context[node.input[1]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp0 = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - rtlsim_inp1 = npy_to_rtlsim_input( - "{}/input_1.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - self.code_gen_dict["$READNPYDATA$"] = [] - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - npy_in = "%s/input_1.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in1_{} ("in1_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - node = self.onnx_node - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {}> (in0_{}, in1_{}, out_{}, 1);""".format( - node.op_type, - self.get_nodeattr("PE"), - self.get_input_datatype().get_hls_datatype_str(), - self.get_input_datatype().get_hls_datatype_str(), - self.get_output_datatype().get_hls_datatype_str(), - self.get_number_output_values(), - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, - hls::stream> &out_{})""".format( - self.onnx_node.name, - self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), - self.hls_sname(), - self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), - self.hls_sname(), - self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - sname = self.hls_sname() - swidth = self.get_instream_width_padded() - intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] - return intf_names - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - "in1": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py deleted file mode 100644 index 5e0063ac33..0000000000 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ /dev/null @@ -1,613 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from math import ceil -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - rtlsim_output_to_npy, -) - -# ONNX i/o tensor shape assumptions for channelwise ops: -# input 0 is the input tensor, shape (..., NumChannels) -# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) -# output 0 is the output tensor, shape (..., NumChannels) - same as input -# the ... here can be any shape (representing groups of vectors) - - -def get_smallest_possible(vals): - """Returns smallest (fewest bits) possible DataType that can represent - value. Prefers unsigned integers where possible.""" - vals = np.array(vals, dtype=np.float64) - for v in vals: - assert int(v) == v, "Error float value" - - for k in DataType.get_accumulator_dt_cands(): - dt = DataType[k] - - if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: - # not currently supported - continue - - if (dt.min() <= vals).all() and (vals <= dt.max()).all(): - return dt - - warnings.warn( - """InferChannelwiseLinearLayer: Output values may not be - representable with supported data types. - Setting maximum width data type available. - This will lead to errors if there are no constrains on the input - """ - ) - - if (0 <= vals).all(): - return DataType["UINT64"] - else: - return DataType["INT64"] - - -class ChannelwiseOp_Batch(HLSCustomOp): - """Class that corresponds to finn-hls Thresholding_Batch function. - It can implement a variety of channel-wise parametrized operations, - including Add, Mul and multi-thresholding. - """ - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - # channelwise "map" function to apply: - # one of cmp_le, cmp_ge, add, mul - "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}), - "PE": ("i", True, 0), - "NumChannels": ("i", True, 0), - # string defining memory resource type for parameters - "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "paramDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def calc_tmem(self): - """Calculates and returns TMEM, the depth of the memory used - to store the channelwise op parameters.""" - chn = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return chn // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - # implement tensor with correct shape - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # check input datatype against property - idt = model.get_tensor_datatype(node.input[0]) - - exp_idt_name = self.get_nodeattr("inputDataType") - if exp_idt_name != idt.name: - func = self.get_nodeattr("Func") - assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer" - - self.set_nodeattr("inputDataType", idt.name) - # update the func in ['add','mul'] cases - - # get parameter ranges - param = model.get_initializer(node.input[1]) - param_min = min(param.flatten()) - param_max = max(param.flatten()) - - # set function and determine output data type - if func == "add": - out_min = idt.min() + param_min - out_max = idt.max() + param_max - odt = get_smallest_possible([out_min, out_max]) - elif func == "mul": - possible_limits = [] - possible_limits += [idt.min() * param_min] - possible_limits += [idt.min() * param_max] - possible_limits += [idt.max() * param_min] - possible_limits += [idt.max() * param_max] - odt = get_smallest_possible(possible_limits) - - self.set_nodeattr("outputDataType", odt.name) - - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - self.get_nodeattr("paramDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required Threshold_Batch attributes do not exist.""") - - return info_messages - - def bram_estimation(self): - """Calculates BRAM cost if resource set to BRAM""" - style = self.get_nodeattr("ram_style") - P = self.get_nodeattr("PE") - idt = self.get_input_datatype() - A = idt.bitwidth() - tmem = self.calc_tmem() - - if style == "block" and tmem > 1: - return int(ceil(A * P / 16)) * int(ceil(tmem / 1024)) - else: - return 0 - - def lut_estimation(self): - """Calculates LUT cost, taking memory resource type into account""" - # TODO add in/out FIFO contributions - style = self.get_nodeattr("ram_style") - P = self.get_nodeattr("PE") - idt = self.get_input_datatype() - A = idt.bitwidth() - tmem = self.calc_tmem() - # cost of comparators - comparator_cost = A * P - # cost of LUTRAM - if style == "distributed" and tmem > 1: - lutram_cost = P * A * int(ceil(tmem / 64)) - else: - lutram_cost = 0 - # total cost - return comparator_cost + lutram_cost - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - # fill in TSrcI - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_parameter_tensor(self, orig_param_vector): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure chn % PE == 0 - * interleave rows between PEs - * reshape into (PE, TMEM) and return - """ - chn = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = chn // pe - assert chn % pe == 0, "Requirement NumChannels divisable by PE is violated." - assert ( - orig_param_vector.ndim == 1 - ), """Parameter vector dimension is {}. - Expected dimension: 1.""".format( - orig_param_vector.ndim - ) - - # if not self.get_input_datatype().signed(): - # # ensure all thresholds are nonnegative - # assert (orig_param_vector >= 0).all() - - # ensure all thresholds are integer - assert (orig_param_vector.astype(np.int32) == orig_param_vector).all() - ret = orig_param_vector - - assert ret.shape[0] == chn, "Cardinality of parameter vector is not as expected (chn)" - - # distribute rows between PEs - ret = ret.reshape(tmem, pe).transpose() - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - - return ret.reshape(1, pe, tmem) - - def generate_params(self, model, path): - code_gen_dir = path - # save thresholds in params.h - parameters = model.get_initializer(self.onnx_node.input[1]) - parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters) - pdt = DataType[self.get_nodeattr("paramDataType")] - - parameters_hls_code = numpy_to_hls_code(parameter_tensor, pdt, "parameters", False, True) - # get input data type - export_idt = self.get_input_datatype() - if self.get_input_datatype() == DataType["BIPOLAR"]: - export_idt = DataType["BINARY"] - idt_hls = export_idt.get_hls_datatype_str() - - # write parameters into params.h - f_params = open("{}/params.h".format(code_gen_dir), "w") - pdt_hls = pdt.get_hls_datatype_str() - # use binary to export bipolar activations - export_odt = self.get_output_datatype() - if self.get_output_datatype() == DataType["BIPOLAR"]: - export_odt = DataType["BINARY"] - odt_hls = export_odt.get_hls_datatype_str() - # get desired function - func = self.get_nodeattr("Func") - if func == "cmp_le": - func_str = "comp::less_equal<%s, %s>" % (idt_hls, pdt_hls) - elif func == "cmp_ge": - func_str = "comp::greater_equal<%s, %s>" % (idt_hls, pdt_hls) - elif func == "add": - func_str = "comp::add<%s, %s, %s>" % (odt_hls, odt_hls, odt_hls) - elif func == "mul": - func_str = "comp::mul<%s, %s, %s>" % (odt_hls, odt_hls, odt_hls) - else: - raise Exception( - """Invalid value for attribute Func! Is currently set to: {} - has to be set to one of the following value - ("cmp_le", "cmp_ge", "add", "mul")""".format( - func - ) - ) - f_params.write( - "static ChannelWiseOperation<{},{},{},{},{},{}> threshs \ - = ".format( - self.calc_tmem(), - self.get_nodeattr("PE"), - idt_hls, - pdt_hls, - odt_hls, - func_str, - ) - ) - f_params.write(parameters_hls_code) - f_params.close() - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for ChannelwiseOp_Batch") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), """Output shape is not as expected""" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] - - # TODO check and add whatever missing - def defines(self, var): - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = numInputVectors[0] - self.code_gen_dict["$DEFINES$"] = [ - """#define NumChannels1 {}\n#define PE1 {}\n#define numReps {}""".format( - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PE"), - numReps, - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - tmpl_args = self.get_template_param_values() - # TODO: why put some template parameters into defines and not others? - # should ImgDim be defined or just filled in here like we do now? - ishape = self.get_folded_input_shape() - if len(ishape) == 3: - spatial_dim = 1 - elif len(ishape) == 5: - spatial_dim = ishape[1] * ishape[2] - else: - raise Exception("""Unexpeted input shape""") - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> - (in0_{}, out_{}, threshs, numReps);""".format( - spatial_dim, - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - self.hls_sname(), - self.hls_sname(), - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL] - # partition for parallel access along PE and N_PARAMS_PER_CHANNEL - # dimensions (dims 1 and 3) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.parameters " "complete dim=1") - ) - # self.code_gen_dict["$PRAGMAS$"].append( - # ( - # "#pragma HLS ARRAY_PARTITION variable=threshs.parameters " - # "complete dim=3" - # ) - # ) - - # set resource type - ram_style = self.get_nodeattr("ram_style") - pe = self.get_nodeattr("PE") - ich = self.get_nodeattr("NumChannels") - # if PE less than NumChannels, assign cores according to ram_style; - # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs - if pe < ich: - if ram_style == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_LUTRAM") - ) - elif ram_style == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_BRAM") - ) - else: - raise Exception( - """Invalid value for attribute ram_style! Is currently set to: {} - has to be set to one of ("block", "distributed")""".format( - ram_style - ) - ) diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py deleted file mode 100644 index 1f2d1b79be..0000000000 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ /dev/null @@ -1,429 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class DuplicateStreams_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib function of the same name.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, 0), - "PE": ("i", True, 0), - # how many duplicated output streams to create - "NumOutputStreams": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_num_output_streams(self): - return self.get_nodeattr("NumOutputStreams") - - def get_normal_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ch]) - return ishape - - def get_folded_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - # since the output shape of both out streams are the same - # return independently from index - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - # since the output shape of both out streams are the same - # return independently from index - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." - num_out = self.get_num_output_streams() - assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" - - oshape = self.get_normal_output_shape() - ret = super().make_const_shape_op(oshape) - ret.output[:] = self.onnx_node.output - return ret - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - for my_out in self.onnx_node.output: - model.set_tensor_datatype(my_out, odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("NumOutputStreams") - self.get_nodeattr("inputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def generate_params(self, model, path): - n_outputs = self.get_num_output_streams() - inp_streams = [] - commands = [] - o_stream_w = self.get_outstream_width() - i_stream_w = self.get_instream_width() - in_stream = "hls::stream > &in0" % (i_stream_w) - inp_streams.append(in_stream) - commands.append("ap_uint<%d> e = in0.read();" % i_stream_w) - iters = self.get_number_output_values() // self.get_num_output_streams() - for i in range(n_outputs): - out_stream = "hls::stream > &out%d" % (o_stream_w, i) - inp_streams.append(out_stream) - cmd = "out%d.write(e);" % i - commands.append(cmd) - - impl_hls_code = [] - impl_hls_code.append("void DuplicateStreamsCustom(") - impl_hls_code.append(",".join(inp_streams)) - impl_hls_code.append(") {") - impl_hls_code.append("for(unsigned int i = 0; i < %d; i++) {" % iters) - impl_hls_code.append("#pragma HLS PIPELINE II=1") - impl_hls_code.append("\n".join(commands)) - impl_hls_code.append("}") - impl_hls_code.append("}") - impl_hls_code = "\n".join(impl_hls_code) - - impl_filename = "{}/duplicate_impl.hpp".format(path) - f_impl = open(impl_filename, "w") - f_impl.write(impl_hls_code) - f_impl.close() - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - n_outputs = self.get_num_output_streams() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_outputs(context, ["output%d.npy" % i for i in range(n_outputs)]) - for i in range(n_outputs): - assert ( - context[node.output[i]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_dict = { - "inputs": {"in0": rtlsim_inp}, - "outputs": {}, - } - for i in range(n_outputs): - rtlsim_dict["outputs"]["out%d" % i] = [] - self.rtlsim_multi_io(sim, rtlsim_dict) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_shape = self.get_folded_output_shape() - for i in range(n_outputs): - out_npy_path = "%s/output%d.npy" % (code_gen_dir, i) - rtlsim_output_to_npy( - rtlsim_dict["outputs"]["out%d" % i], - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - ) - # load and reshape output 0 - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[i]] = output - - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output0 shape doesn't match expected shape.""" - assert ( - context[node.output[1]].shape == exp_oshape - ), """Output1 shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "duplicate_impl.hpp"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - n_outputs = self.get_num_output_streams() - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - for i in range(n_outputs): - out_name = "out%d_%s" % (i, self.hls_sname()) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> %s ("%s");' - % (self.get_outstream_width(), out_name, out_name) - ) - - def docompute(self): - n_outputs = self.get_num_output_streams() - ostreams = [] - for i in range(n_outputs): - ostreams.append("out%d_%s" % (i, self.hls_sname())) - dc = "DuplicateStreamsCustom(in0_%s, %s);" % ( - self.hls_sname(), - ",".join(ostreams), - ) - self.code_gen_dict["$DOCOMPUTE$"] = [dc] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - n_outputs = self.get_num_output_streams() - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - outstrm_code = [] - - for i in range(n_outputs): - out_name = "out%d_%s" % (i, self.hls_sname()) - npy_out = "%s/output%d.npy" % (code_gen_dir, i) - outstrm_code.append( - 'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - out_name, - oshape_cpp_str, - npy_out, - ) - ) - - self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - n_outputs = self.get_num_output_streams() - inp_streams = [] - o_stream_w = self.get_outstream_width() - i_stream_w = self.get_instream_width() - in_stream = "hls::stream > &in0_%s" % (i_stream_w, self.hls_sname()) - inp_streams.append(in_stream) - for i in range(n_outputs): - out_stream = "hls::stream > &out%d_%s" % ( - o_stream_w, - i, - self.hls_sname(), - ) - inp_streams.append(out_stream) - - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}({})""".format( - self.onnx_node.name, - ",".join(inp_streams), - ) - ] - - def pragmas(self): - n_outputs = self.get_num_output_streams() - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - for i in range(n_outputs): - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out%d_%s" % (i, self.hls_sname()) - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - n_outputs = self.get_num_output_streams() - sname = self.hls_sname() - intf_names["m_axis"] = [] - for i in range(n_outputs): - intf_names["m_axis"].append( - ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) - ) - return intf_names - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out0": [], "out1": []}, - } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/eltwise.py deleted file mode 100644 index ab1dc00118..0000000000 --- a/src/finn/custom_op/fpgadataflow/eltwise.py +++ /dev/null @@ -1,484 +0,0 @@ -# Copyright (c) 2022, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class StreamingEltwise(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingEltwise function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = super().get_nodeattr_types() - my_attrs.update( - { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType0": ("s", True, ""), - "inputDataType1": ("s", True, ""), - # type of EltwiseFunction for the operation - "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - "inFIFODepths": ("ints", False, [2, 2]), - } - ) - return my_attrs - - def get_eltwise_op_lambda(self): - eltwise_op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - odt = self.get_output_datatype() - tin0 = idt0.get_hls_datatype_str() - tin1 = idt1.get_hls_datatype_str() - tout = odt.get_hls_datatype_str() - eltwise_ops = { - # "Add": "[](auto a, auto b) { return a + b; }", - # "Sub": "[](auto a, auto b) { return a - b; }", - # "AbsDiff": "[](auto a, auto b) { return a>b? a-b : b-a; }", - "Add": f"add<{tin0}, {tin1}, {tout}>()", - "Sub": f"sub<{tin0}, {tin1}, {tout}>()", - "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", - } - return eltwise_ops[eltwise_op] - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich]) - return ishape - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - assert ich % pe == 0, "PE must divide NumChannels" - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich // pe, pe]) - return ishape - - def get_normal_output_shape(self, ind=0): - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input1 shape." - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) - assert ishape == exp_ishape, "Unexpected input2 shape." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt0 = model.get_tensor_datatype(node.input[0]) - if idt0 != self.get_input_datatype(0): - warn_str = "inputDataType0 changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype(0)), - str(idt0), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType0", idt0.name) - idt1 = model.get_tensor_datatype(node.input[1]) - if idt1 != self.get_input_datatype(1): - warn_str = "inputDataType1 changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype(1)), - str(idt1), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType1", idt1.name) - # enforce output data type (calculated based on idt) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType0") - self.get_nodeattr("inputDataType1") - self.get_nodeattr("eltwiseOp") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required StreamingEltwise attributes do not exist.""") - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType" + str(ind))] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - assert idt0.signed() == idt1.signed(), ( - "%s: Inputs must have same signedness" % self.onnx_node.name - ) - idt0_min, idt0_max = idt0.min(), idt0.max() - idt1_min, idt1_max = idt1.min(), idt1.max() - cands = [ - idt0_min - idt1_min, - idt0_min - idt1_max, - idt0_max - idt1_min, - idt0_max - idt1_max, - ] - largest_magnitude = max(map(abs, cands)) - if op == "Add": - if idt0.signed(): - return DataType.get_smallest_possible(idt0.min() + idt1.min()) - else: - return DataType.get_smallest_possible(idt0.max() + idt1.max()) - elif op == "Sub": - return DataType.get_smallest_possible(-largest_magnitude) - elif op == "AbsDiff": - return DataType.get_smallest_possible(largest_magnitude) - else: - raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype(ind).bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" - export_idt0 = self.get_input_datatype(0) - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - # exact same thing for input1 - inp = context[node.input[1]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" - export_idt1 = self.get_input_datatype(1) - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_1.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits0 = self.get_instream_width(0) - nbits1 = self.get_instream_width(1) - rtlsim_inp0 = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt0, nbits0 - ) - rtlsim_inp1 = npy_to_rtlsim_input( - "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1 - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = [ - '#include "eltwise.hpp"', - '#include "interpret.hpp"', - ] - - self.code_gen_dict["$GLOBALS$"].extend( - [ - "template", - "struct absdiff {", - "TO operator()(TI1 const &a, TI2 const &b) const {", - "#pragma HLS inline", - "return a>b? a-b : b-a;", - "}", - "};", - "template", - "struct sub {", - "TO operator()(TI1 const &a, TI2 const &b) const {", - "#pragma HLS inline", - "return a-b;", - "}", - "};", - "template", - "struct add {", - "TO operator()(TI1 const &a, TI2 const &b) const {", - "#pragma HLS inline", - "return a+b;", - "}", - "};", - ] - ) - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - elem_bits_0 = idt0.bitwidth() - elem_bits_1 = idt1.bitwidth() - packed_bits_0 = self.get_instream_width(0) - packed_hls_type_0 = "ap_uint<%d>" % packed_bits_0 - packed_bits_1 = self.get_instream_width(1) - packed_hls_type_1 = "ap_uint<%d>" % packed_bits_1 - elem_hls_type_0 = idt0.get_hls_datatype_str() - elem_hls_type_1 = idt1.get_hls_datatype_str() - npy_type = "float" - self.code_gen_dict["$READNPYDATA$"] = [] - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type_0, - elem_hls_type_0, - elem_bits_0, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - npy_in = "%s/input_1.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);' - % ( - packed_hls_type_1, - elem_hls_type_1, - elem_bits_1, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(0), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in1_{} ("in1_{}");'.format( - self.get_instream_width(1), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - odt = self.get_output_datatype() - elem_hls_type_0 = idt0.get_hls_datatype_str() - elem_hls_type_1 = idt1.get_hls_datatype_str() - out_hls_type = odt.get_hls_datatype_str() - slice_in0 = "Slice<%s>" % elem_hls_type_0 - slice_in1 = "Slice<%s>" % elem_hls_type_1 - slice_out = "Slice<%s>" % out_hls_type - eltwise_op_str = self.get_eltwise_op_lambda() - "%sEltwiseFunction<%s, %s, %s>()" % ( - op, - elem_hls_type_0, - elem_hls_type_1, - out_hls_type, - ) - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {}, {}>(in0_{}, in1_{}, out_{}, {});""".format( - "StreamingEltwise", - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PE"), - int(np.prod(self.get_folded_output_shape()[:-2])), - slice_in0, - slice_in1, - slice_out, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - eltwise_op_str, - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, - hls::stream> &out_{})""".format( - self.onnx_node.name, - self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(), - self.hls_sname(), - self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(), - self.hls_sname(), - self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - sname = self.hls_sname() - swidth = self.get_instream_width_padded() - intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] - return intf_names diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py deleted file mode 100644 index 5bd5e07916..0000000000 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py +++ /dev/null @@ -1,407 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class FMPadding_Batch(HLSCustomOp): - """Corresponds to finn-hlslib FMPadding_Batch function. - Pads input image by given amount.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - # spatial size of input images - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - # total padding (per dimension) to apply - "Padding": ( - "ints", - True, - [1, 1, 1, 1], - ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] - # number of channels in input image - "NumChannels": ("i", True, 0), - # SIMD Input parallelism - "SIMD": ("i", False, 1), - # FINN input datatype - "inputDataType": ("s", True, ""), - # shape describing input vecs per execution - "numInputVectors": ("i", False, 1), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_padded_odim(self): - "Return the padded spatial size of the output." - idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - odim_h = idim_h + pad_h - odim_w = idim_w + pad_w - return [odim_h, odim_w] - - def get_exp_cycles(self): - odim_h, odim_w = self.get_padded_odim() - channels = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim_h * odim_w - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("ImgDim") - num_ch = self.get_nodeattr("NumChannels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - odim_h, odim_w = self.get_padded_odim() - num_ch = self.get_nodeattr("NumChannels") - - oshape = (1, odim_h, odim_w, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_ishape[-1] / simd) - folded_ishape = normal_ishape[:-1] + [fold, simd] - return tuple(folded_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_oshape[-1] / simd) - folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for SameResize." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - # the hlslib op always pads with zeros, so ensure that the DataType - # is able to represent zeros - assert ret.allowed(0), "FMPadding_Batch DataType must support zero" - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. (Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return ibits * simd - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return obits * simd - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - idim_h, idim_w = self.get_nodeattr("ImgDim") - odim_h, odim_w = self.get_padded_odim() - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - is_square_img = idim_h == idim_w - is_square_pad = pad_h == pad_w - - if is_square_img and is_square_pad: - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim1 {}\n#define OutputDim1 {}\n - #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n - #define NumChannels1 {}\n#define SIMD1 {}\n - #define numReps {}\n""".format( - idim_h, - odim_h, - pad[0], - pad[2], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("numInputVectors"), - ) - ] - else: - self.code_gen_dict["$DEFINES$"] = [ - """ - #define OutputDim1_x {}\n - #define OutputDim1_y {}\n - #define PaddingLeft1 {}\n - #define PaddingRight1 {}\n - #define PaddingTop1 {}\n - #define PaddingBottom1 {}\n - #define NumChannels1 {}\n - #define SIMD1 {}\n - #define numReps {}\n - """.format( - odim_w, - odim_h, - pad[1], - pad[3], - pad[0], - pad[2], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("numInputVectors"), - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - in_t = self.get_input_datatype().get_hls_datatype_str() - node = self.onnx_node - - idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - is_square_img = idim_h == idim_w - is_square_pad = pad_h == pad_w - - if is_square_img and is_square_pad: - hls_call = node.op_type - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0_{}, out_{}, numReps);""".format( - hls_call, in_t, self.hls_sname(), self.hls_sname() - ) - ] - else: - hls_call = "FMPadding_nonsquare_Batch" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0_{}, out_{}, numReps);""".format( - hls_call, in_t, self.hls_sname(), self.hls_sname() - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py deleted file mode 100644 index d79c214730..0000000000 --- a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import math -import numpy as np -import os -import shutil -import warnings -from qonnx.core.datatype import DataType -from qonnx.util.basic import roundup_to_integer_multiple - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - - -class FMPadding_rtl(HLSCustomOp): - """CustomOp wrapper for the finn-rtllib fmpadding_axi component - Supports adjusting the padding amount and spatial feature sizes at - runtime.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - # spatial size of input images - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - # total padding (per dimension) to apply - "Padding": ( - "ints", - True, - [1, 1, 1, 1], - ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] - # number of channels in input image - "NumChannels": ("i", True, 0), - # SIMD Input parallelism - "SIMD": ("i", False, 1), - # FINN input datatype - "inputDataType": ("s", True, ""), - # shape describing input vecs per execution - "numInputVectors": ("i", False, 1), - # Enable reprogrammable implementation to change FM dimensions, - # stride, or dilation during runtime - "dynamic_mode": ("i", False, 0, {0, 1}), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_padded_odim(self): - "Return the padded spatial size of the output." - idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - odim_h = idim_h + pad_h - odim_w = idim_w + pad_w - return [odim_h, odim_w] - - def get_exp_cycles(self): - odim_h, odim_w = self.get_padded_odim() - channels = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim_h * odim_w - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("ImgDim") - num_ch = self.get_nodeattr("NumChannels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - odim_h, odim_w = self.get_padded_odim() - num_ch = self.get_nodeattr("NumChannels") - - oshape = (1, odim_h, odim_w, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_ishape[-1] / simd) - folded_ishape = normal_ishape[:-1] + [fold, simd] - return tuple(folded_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_oshape[-1] / simd) - folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape for FMPadding_rtl." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - # the hlslib op always pads with zeros, so ensure that the DataType - # is able to represent zeros - assert ret.allowed(0), "FMPadding_rtl DataType must support zero" - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. (Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return ibits * simd - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return obits * simd - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_verilog_top_module_intf_names(self): - # Overload default HLSCustomOp implementation to add axilite control IF - intf_names = super().get_verilog_top_module_intf_names() - if self.get_nodeattr("dynamic_mode"): - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - raise Exception("cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" - - def get_template_values(self, ifm_dims, pads, chans, simd, idt): - dimY, dimX = ifm_dims - padT, padL, padB, padR = pads - y_counter_bits = int(math.ceil(math.log2(padT + dimY + padB + 1))) - x_counter_bits = int(math.ceil(math.log2(padL + dimX + padR + 1))) - topname = self.get_verilog_top_module_name() - stream_bits = idt.bitwidth() * simd - stream_bits = int(roundup_to_integer_multiple(stream_bits, 8)) - code_gen_dict = { - "XCOUNTER_BITS": int(x_counter_bits), - "YCOUNTER_BITS": int(y_counter_bits), - "NUM_CHANNELS": int(chans), - "SIMD": int(simd), - "ELEM_BITS": idt.bitwidth(), - "TOP_MODULE_NAME": topname, - "INIT_XON": int(padL), - "INIT_XOFF": int(padL + dimX), - "INIT_XEND": int(padL + dimX + padR - 1), - "INIT_YON": int(padT), - "INIT_YOFF": int(padT + dimY), - "INIT_YEND": int(padT + dimY + padB - 1), - "STREAM_BITS": int(stream_bits), - } - return code_gen_dict - - def get_dynamic_config(self, ifm_dims=None, pads=None): - """Returns a configuration dict to re-configure FM dimension and - padding amounts during runtime.""" - - if ifm_dims is None: - ifm_dims = self.get_nodeattr("ImgDim") - if pads is None: - pads = self.get_nodeattr("Padding") - chans = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - idt = self.get_input_datatype() - code_gen_dict = self.get_template_values(ifm_dims, pads, chans, simd, idt) - config = { - "XON": (0 * 4, (code_gen_dict["INIT_XON"])), - "XOFF": (1 * 4, (code_gen_dict["INIT_XOFF"])), - "XEND": (2 * 4, (code_gen_dict["INIT_XEND"])), - "YON": (3 * 4, (code_gen_dict["INIT_YON"])), - "YOFF": (4 * 4, (code_gen_dict["INIT_YOFF"])), - "YEND": (5 * 4, (code_gen_dict["INIT_YEND"])), - } - return config - - def generate_hdl(self): - rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" - template_path = rtlsrc + "/fmpadding_template.v" - dims = self.get_nodeattr("ImgDim") - pads = self.get_nodeattr("Padding") - chans = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - idt = self.get_input_datatype() - code_gen_dict = self.get_template_values(dims, pads, chans, simd, idt) - # save top module name so we can refer to it after this node has been renamed - # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) - self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) - - # apply code generation to templates - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - with open(template_path, "r") as f: - template = f.read() - for key_name in code_gen_dict: - key = "$%s$" % key_name - template = template.replace(key, str(code_gen_dict[key_name])) - - with open( - os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), - "w", - ) as f: - f.write(template) - - sv_files = ["fmpadding_axi.sv", "fmpadding.sv", "axi2we.sv"] - for sv_file in sv_files: - shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) - # set ipgen_path and ip_path so that HLS-Synth transformation - # and stich_ip transformation do not complain - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) - - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - # Modified to use generated (System-)Verilog instead of HLS output products - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] - verilog_files = [ - "fmpadding_axi.sv", - "fmpadding.sv", - "axi2we.sv", - self.get_nodeattr("gen_top_module") + ".v", - ] - - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - - def code_generation_ipi(self): - """Constructs and returns the TCL for node instantiation in Vivado IPI.""" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - - sourcefiles = [ - "fmpadding_axi.sv", - "fmpadding.sv", - "axi2we.sv", - self.get_nodeattr("gen_top_module") + ".v", - ] - - sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] - - cmd = [] - for f in sourcefiles: - cmd += ["add_files -norecurse %s" % (f)] - cmd += [ - "create_bd_cell -type module -reference %s %s" - % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) - ] - return cmd - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: Generates C++ code and tcl script for IP generation. - Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py deleted file mode 100644 index 5ed440dace..0000000000 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class GlobalAccPool_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib AccPool_Batch function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, 0), - "PE": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_normal_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ch]) - return ishape - - def get_folded_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - if len(vecs) == 1: - oshape = tuple(vecs + [ch]) - elif len(vecs) == 3: - oshape = tuple([vecs[0]] + [1, 1, ch]) - return oshape - - def get_folded_output_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - unfolded_shape = list(self.get_normal_output_shape()) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - oshape = tuple(unfolded_shape[:-1] + [folds, pe]) - return oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") - - # verify that input data is 2D - if len(self.get_nodeattr("numInputVectors")) != 3: - info_messages.append("""GlobalAccPool_Batch requires 2D data input.""") - raise Exception - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - # determine data type from image size and input type - idt = DataType[self.get_nodeattr("inputDataType")] - vecs = list(self.get_nodeattr("numInputVectors")) - npixels = vecs[-1] * vecs[-2] - if idt.signed(): - extreme_value = npixels * idt.min() - else: - extreme_value = npixels * idt.max() - return DataType.get_smallest_possible(extreme_value) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[1:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * idim * idim + Channels/PE - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - folds = int(ch / pe) - return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format( - self.get_normal_input_shape()[1], - self.get_nodeattr("NumChannels"), - self.get_input_datatype().get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.get_output_datatype().get_hls_datatype_str(), - self.hls_sname(), - self.hls_sname(), - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{})""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/labelselect_batch.py deleted file mode 100644 index 60d3eb9154..0000000000 --- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.util.basic import roundup_to_integer_multiple - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class LabelSelect_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib LabelSelect_Batch function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - odt_name = self.get_nodeattr("outputDataType") - if odt_name == "": - # If not provided compute min size - labels = self.get_nodeattr("Labels") - odt = DataType.get_smallest_possible(labels - 1) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(odt.bitwidth(), 8) - new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw)) - odt = DataType[new_odt_name] - odt_name = odt.name - self.set_nodeattr("outputDataType", odt_name) - - def get_nodeattr_types(self): - my_attrs = { - "Labels": ("i", True, 0), - "PE": ("i", True, 0), - "K": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - "outputDataType": ("s", False, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_normal_input_shape(self, ind=0): - nlabels = self.get_nodeattr("Labels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [nlabels]) - return ishape - - def get_folded_input_shape(self, ind=0): - nlabels = self.get_nodeattr("Labels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert nlabels % pe == 0, "PE must divide Labels" - folds = int(nlabels / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k = self.get_nodeattr("K") - vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple(vecs + [k]) - return oshape - - def get_folded_output_shape(self, ind=0): - k = self.get_nodeattr("K") - vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple(vecs + [k, 1]) - return oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." - return helper.make_node( - "RandomNormal", - inputs=[], - outputs=[self.onnx_node.output[0]], - mean=0.0, - scale=1.0, - dtype=TensorProto.INT64, - shape=list(oshape), - ) - - def infer_node_datatype(self, model): - node = self.onnx_node - # check input datatype against property - idt = model.get_tensor_datatype(node.input[0]) - self.set_nodeattr("inputDataType", idt.name) - - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("Labels") - self.get_nodeattr("PE") - self.get_nodeattr("K") - self.get_nodeattr("inputDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") - - # verify that input data is 1D - if len(self.get_nodeattr("numInputVectors")) > 1: - info_messages.append("""LabelSelect_Batch requires 1D data input.""") - raise Exception - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - ret = DataType[self.get_nodeattr("outputDataType")] - return ret - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - return self.get_output_datatype().bitwidth() - - def get_number_output_values(self): - return self.get_nodeattr("K") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - # TopK ind output normally uses TensorProto.INT64, which - # can cause issues for the node-by-node simulation in FINN - # (as the custom DataType system always assumes float containers) - # so cast the output to int64 - ret = context[node.output[0]] - context[node.output[0]] = ret.astype(np.int64) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - - # Calling npy2apintstream with reverse_inner = false to have LE packing - # as required by HLS fxn LabelSelect_Batch - # Also notice that StreamingDataWidthConverter_Batch performs LE packing - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - node = self.onnx_node - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format( - node.op_type, - self.get_nodeattr("Labels"), - self.get_nodeattr("PE"), - self.get_nodeattr("K"), - self.get_input_datatype().get_hls_datatype_str(), - self.get_output_datatype().get_hls_datatype_str(), - self.hls_sname(), - self.hls_sname(), - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream > &out_{})""".format( - self.onnx_node.name, - self.get_nodeattr("PE"), - self.get_input_datatype().bitwidth(), - self.hls_sname(), - self.get_output_datatype().bitwidth(), - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def get_exp_cycles(self): - nlabels = self.get_nodeattr("Labels") - pe = self.get_nodeattr("PE") - exp_cycles = nlabels / pe - return int(exp_cycles) diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py deleted file mode 100755 index 8f294da4ac..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType -from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -# TODO: consider splitting this into separate implementations for 1D and 2D -# similar to what we do for ConvolutionInputGenerator - - -class StreamingMaxPool_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingMaxPool_batch function.""" - - def get_nodeattr_types(self): - my_attrs = { - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - "PoolDim": ("ints", True, []), # [H, W] = [Y, X] - "NumChannels": ("i", True, 0), - # parallelism control - only supported for 1D maxpool - "PE": ("i", False, 0), - # round up (instead of down) output size - only supported for 1D maxpool - "CeilMode": ("i", False, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_1d_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # assume the dummy ('1') dimension is the Y-dimension, i.e. - # images and kernels (and their attributes) of dimension - # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] - ifm_dim = self.get_nodeattr("ImgDim") - k = self.get_nodeattr("PoolDim") - ifm_ch = self.get_nodeattr("NumChannels") - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - k = k[::-1] - return (ifm_dim, k, ifm_ch) - - def is_1d(self): - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - return (ifm_dim[0] == 1) and (k[0] == 1) - - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - ifm_ch = self.get_nodeattr("NumChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - ifm_ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - nf = int(ifm_ch / pe) - if self.is_1d(): - folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) - else: - folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - k_h, k_w = tuple(self.get_nodeattr("PoolDim")) - ifm_ch = self.get_nodeattr("NumChannels") - ceil_mode = self.get_nodeattr("CeilMode") - if not self.is_1d(): - assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" - assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" - ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) - ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) - oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ifm_ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - nf = int(ifm_ch / pe) - ret = list(self.get_normal_output_shape()) - if self.is_1d(): - ret[-1] = nf - ret.append(pe) - else: - ret.insert(-1, 1) - return tuple(ret) - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_exp_cycles(self): - # derived from StreamingMaxPool_Batch loop nest - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - - warnings.warn( - """Estimated latency for layer {} can be lower than - actual latency!""".format( - self.onnx_node.name - ) - ) - if self.is_1d(): - _, _, _, nf, _ = self.get_folded_output_shape() - ceil_mode = self.get_nodeattr("CeilMode") - ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - exp_cycles = ofm_dim * nf * (k[1] + 1) - return int(exp_cycles) - else: - # TODO: adjust inaccurate formula - return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) - - def get_instream_width(self, ind=0): - dt_bits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - ifm_ch = self.get_nodeattr("NumChannels") - if self.is_1d(): - in_width = int(dt_bits * pe) - else: - in_width = int(dt_bits * ifm_ch) - return in_width - - def get_outstream_width(self, ind=0): - """For streaming maxpool out stream width is the same as in stream width""" - return self.get_instream_width() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for StreamingMaxPool." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""StreamingMaxPool_Batch needs 1 data input""") - - return info_messages - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] - - def defines(self, var): - numReps = 1 - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - ceil_mode = self.get_nodeattr("CeilMode") - output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - - if self.is_1d(): - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define PE {}\n #define OutputSize {} - \n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PE"), - output_size, - numReps, - ) - ] - else: - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - numReps, - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - dtype = self.get_input_datatype() - if dtype.bitwidth() == 1: - if self.is_1d(): - raise Exception("Binary 1d MaxPool not implemented on HLS backend") - else: - op = "StreamingMaxPool" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0_%s, out_%s);" - % (op, self.hls_sname(), self.hls_sname()) - ] - else: - dtype = self.get_input_datatype() - dtype_hls = dtype.get_hls_datatype_str() - minval_str = str(int(dtype.min())) - if self.is_1d(): - op = "StreamingMaxPool_Precision_1d" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """%s(in0_%s, out_%s);""" - % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) - ] - else: - op = "StreamingMaxPool_Precision" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0_%s, out_%s);" - % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" From 4770a30a07e2fd3dc8ccbb43c81067d439454853 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 11 Jan 2024 14:06:40 +0000 Subject: [PATCH 026/291] [CustomOp] Initial draft of lookup in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 +- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../custom_op/fpgadataflow/hls/lookup_hls.py | 353 ++++++++++++++++++ src/finn/custom_op/fpgadataflow/lookup.py | 344 ++--------------- .../fpgadataflow/convert_to_hw_layers.py | 55 ++- .../fpgadataflow/test_fpgadataflow_lookup.py | 20 +- 6 files changed, 458 insertions(+), 318 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/lookup_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 68f565144f..bc9b9ae649 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -86,7 +86,6 @@ custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition -custom_op["Lookup"] = Lookup custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum @@ -96,6 +95,7 @@ custom_op["DuplicateStreams"] = DuplicateStreams custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect +custom_op["Lookup"] = Lookup custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index df58decf81..38d28a66d6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -32,6 +32,7 @@ from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls +from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls @@ -46,6 +47,7 @@ custom_op["FMPadding_hls"] = FMPadding_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls custom_op["LabelSelect_hls"] = LabelSelect_hls +custom_op["Lookup_hls"] = Lookup_hls custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py new file mode 100644 index 0000000000..885d3039a4 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -0,0 +1,353 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from math import ceil, log2 +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.lookup import Lookup +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + + +class Lookup_hls(Lookup, HLSBackend): + "Streaming elementwise HLS lookup, mapping indices to values." + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(Lookup.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + mem_mode = self.get_nodeattr("mem_mode") + global_incls = [] + global_incls.append('#include "lookup.hpp"') + if mem_mode == "const": + global_incls.append('#include "embeddings.hpp"') + self.code_gen_dict["$GLOBALS$"] = global_incls + + def defines(self, var): + n_inputs = np.prod(self.get_folded_input_shape()[:-1]) + dtype = self.get_input_datatype() + elem_hls_type = dtype.get_hls_datatype_str() + emb_type = DataType[self.get_nodeattr("EmbeddingType")] + emb_hls_type = emb_type.get_hls_datatype_str() + emb_dim = self.get_nodeattr("EmbeddingDim") + mem_mode = self.get_nodeattr("mem_mode") + my_defines = [] + my_defines.append("#define NumInputs %d" % n_inputs) + if mem_mode == "external": + ext_mem_width = self.get_nodeattr("ext_mem_width") + ext_mem_emb_size = self.get_folded_output_shape()[-2] + ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) + my_defines.append("#define MemBits %d" % ext_mem_width) + my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size) + my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) + my_defines.append("#define T_SRC %s" % elem_hls_type) + my_defines.append("#define T_DST ap_uint") + elif mem_mode == "const": + my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")) + my_defines.append("#define EmbeddingDim %d" % emb_dim) + my_defines.append("#define InputType %s" % elem_hls_type) + my_defines.append("#define EmbeddingType %s" % emb_hls_type) + self.code_gen_dict["$DEFINES$"] = my_defines + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "int64_t" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + "false", + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup(in0_%s, out_%s, embeddings);""" + % (self.hls_sname(), self.hls_sname()) + ] + elif mem_mode == "external": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup_ext(in0_%s, out_%s, mem, size, oob_count, + oob_irq);""" + % (self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + ibits = self.get_instream_width() + packed_input_hls_type = "ap_uint<%d>" % ibits + obits = self.get_outstream_width() + packed_output_hls_type = "ap_uint<%d>" % obits + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_input_hls_type, + self.hls_sname(), + packed_output_hls_type, + self.hls_sname(), + ) + ] + elif mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void " + + self.onnx_node.name + + "(hls::stream &in0_%s, hls::stream &out_%s, " + % (self.hls_sname(), self.hls_sname()) + + "T_DST const *const mem, unsigned const size, " + + "unsigned &oob_count, bool &oob_irq)" + ] + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()] + my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname()) + my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if mem_mode == "const": + my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM") + elif mem_mode == "external": + my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control") + my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) + self.code_gen_dict["$PRAGMAS$"] = my_pragmas + + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + embeddings = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "const": + code_gen_dir = path + weight_filename = "{}/embeddings.hpp".format(code_gen_dir) + edt = DataType[self.get_nodeattr("EmbeddingType")] + # obits = self.get_outstream_width() + # packed_output_hls_type = "ap_uint<%d>" % obits + assert np.vectorize(edt.allowed)( + embeddings + ).all(), "Embeddings can't be expressed with type %s" % str(edt) + # reverse innertmost dim in embeddings to remain compatible with + # how we normally encode the data in FINN + embeddings_rev = np.flip(embeddings, -1) + embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False) + f_thresh = open(weight_filename, "w") + f_thresh.write(embeddings_hls_code) + f_thresh.close() + elif mem_mode == "external": + edt = DataType[self.get_nodeattr("EmbeddingType")] + ext_mem_width = self.get_nodeattr("ext_mem_width") + assert edt.bitwidth() == 8, ( + "Lookup with mem_mode=external " + + "only works with 8-bit embeddings but found " + + str(edt) + ) + emb_dim = self.get_nodeattr("EmbeddingDim") + # need to zero-pad embeddings in external mode for burst alignment + # compute how much padding we need + emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1] + ext_mem_emb_size = self.get_folded_output_shape()[-2] + ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) + align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align) + pad_amount = align_factor - emb_dim + embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) + # reshape for packing the innermost dim + embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width) + weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) + ret = pack_innermost_dim_as_hex_string( + embeddings_padded, edt, ext_mem_width, True, prefix="" + ) + with open(weight_filename, "w") as f: + for current_line in ret: + f.write(current_line + "\n") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = tuple(self.get_normal_input_shape()) + exp_oshape = tuple(self.get_normal_output_shape()) + folded_ishape = tuple(self.get_folded_input_shape()) + folded_oshape = tuple(self.get_folded_output_shape()) + mem_mode = self.get_nodeattr("mem_mode") + assert ( + mem_mode == "const" + ), "Only mem_mode=const is supported for simulation of Lookup layer" + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + odt = self.get_output_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def get_ap_int_max_w(self): + parent_max = super().get_ap_int_max_w() + mem_mode = self.get_nodeattr("mem_mode") + ext_mem_width = self.get_nodeattr("ext_mem_width") + if mem_mode == "external": + return max(ext_mem_width, parent_max) + else: + return parent_max diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index 2dfca90ed9..367bda1f07 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,22 +27,19 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os +import onnxruntime as rt import warnings -from math import ceil, log2 +from math import ceil +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class Lookup(HLSCustomOp): - "Streaming elementwise HLS lookup, mapping indices to values." +class Lookup(HWCustomOp): + """Abstraction layer for HW implementation of streaming elementwise lookup, + mapping indices to values.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -156,295 +153,37 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - mem_mode = self.get_nodeattr("mem_mode") - global_incls = [] - global_incls.append('#include "lookup.hpp"') - if mem_mode == "const": - global_incls.append('#include "embeddings.hpp"') - self.code_gen_dict["$GLOBALS$"] = global_incls - - def defines(self, var): - n_inputs = np.prod(self.get_folded_input_shape()[:-1]) - dtype = self.get_input_datatype() - elem_hls_type = dtype.get_hls_datatype_str() - emb_type = DataType[self.get_nodeattr("EmbeddingType")] - emb_hls_type = emb_type.get_hls_datatype_str() - emb_dim = self.get_nodeattr("EmbeddingDim") - mem_mode = self.get_nodeattr("mem_mode") - my_defines = [] - my_defines.append("#define NumInputs %d" % n_inputs) - if mem_mode == "external": - ext_mem_width = self.get_nodeattr("ext_mem_width") - ext_mem_emb_size = self.get_folded_output_shape()[-2] - ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) - my_defines.append("#define MemBits %d" % ext_mem_width) - my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size) - my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) - my_defines.append("#define T_SRC %s" % elem_hls_type) - my_defines.append("#define T_DST ap_uint") - elif mem_mode == "const": - my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")) - my_defines.append("#define EmbeddingDim %d" % emb_dim) - my_defines.append("#define InputType %s" % elem_hls_type) - my_defines.append("#define EmbeddingType %s" % emb_hls_type) - self.code_gen_dict["$DEFINES$"] = my_defines - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "int64_t" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - "false", - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup(in0_%s, out_%s, embeddings);""" - % (self.hls_sname(), self.hls_sname()) - ] - elif mem_mode == "external": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup_ext(in0_%s, out_%s, mem, size, oob_count, - oob_irq);""" - % (self.hls_sname(), self.hls_sname()) - ] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - ibits = self.get_instream_width() - packed_input_hls_type = "ap_uint<%d>" % ibits - obits = self.get_outstream_width() - packed_output_hls_type = "ap_uint<%d>" % obits - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_input_hls_type, - self.hls_sname(), - packed_output_hls_type, - self.hls_sname(), - ) - ] - elif mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void " - + self.onnx_node.name - + "(hls::stream &in0_%s, hls::stream &out_%s, " - % (self.hls_sname(), self.hls_sname()) - + "T_DST const *const mem, unsigned const size, " - + "unsigned &oob_count, bool &oob_irq)" - ] - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()] - my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname()) - my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if mem_mode == "const": - my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM") - elif mem_mode == "external": - my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control") - my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") - else: - raise Exception("Unrecognized mem_mode: " + mem_mode) - self.code_gen_dict["$PRAGMAS$"] = my_pragmas - - def generate_params(self, model, path): - mem_mode = self.get_nodeattr("mem_mode") - embeddings = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": - code_gen_dir = path - weight_filename = "{}/embeddings.hpp".format(code_gen_dir) - edt = DataType[self.get_nodeattr("EmbeddingType")] - # obits = self.get_outstream_width() - # packed_output_hls_type = "ap_uint<%d>" % obits - assert np.vectorize(edt.allowed)( - embeddings - ).all(), "Embeddings can't be expressed with type %s" % str(edt) - # reverse innertmost dim in embeddings to remain compatible with - # how we normally encode the data in FINN - embeddings_rev = np.flip(embeddings, -1) - embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False) - f_thresh = open(weight_filename, "w") - f_thresh.write(embeddings_hls_code) - f_thresh.close() - elif mem_mode == "external": - edt = DataType[self.get_nodeattr("EmbeddingType")] - ext_mem_width = self.get_nodeattr("ext_mem_width") - assert edt.bitwidth() == 8, ( - "Lookup with mem_mode=external " - + "only works with 8-bit embeddings but found " - + str(edt) - ) - emb_dim = self.get_nodeattr("EmbeddingDim") - # need to zero-pad embeddings in external mode for burst alignment - # compute how much padding we need - emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1] - ext_mem_emb_size = self.get_folded_output_shape()[-2] - ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) - align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align) - pad_amount = align_factor - emb_dim - embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) - # reshape for packing the innermost dim - embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width) - weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) - ret = pack_innermost_dim_as_hex_string( - embeddings_padded, edt, ext_mem_width, True, prefix="" - ) - with open(weight_filename, "w") as f: - for current_line in ret: - f.write(current_line + "\n") - else: - raise Exception("Unrecognized mem_mode: " + mem_mode) - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # create a standard add node to help calculate the result node = self.onnx_node - exp_ishape = tuple(self.get_normal_input_shape()) - exp_oshape = tuple(self.get_normal_output_shape()) - folded_ishape = tuple(self.get_folded_input_shape()) - folded_oshape = tuple(self.get_folded_output_shape()) - mem_mode = self.get_nodeattr("mem_mode") - assert ( - mem_mode == "const" - ), "Only mem_mode=const is supported for simulation of Lookup layer" - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" - export_idt = self.get_input_datatype() - odt = self.get_output_datatype() + inp_values = context[node.input[0]] + ishape = inp_values.shape + data_values = context[node.input[1]] + dshape = data_values.shape + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.INT64, ishape) + data = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, dshape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_gather = helper.make_node( + "Gather", + inputs=[node.input[1], node.input[0]], + outputs=[node.output[0]], + ) + graph_gather = helper.make_graph( + nodes=[node_gather], + name="single-gather-exec", + inputs=[data, inp], + outputs=[outp], + ) - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - reverse_inner=True, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" + opset_version = 13 + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_gather = qonnx_make_model(graph_gather, **onnx_kwargs) + idict = {node.input[0]: inp_values, node.input[1]: data_values} + sess = rt.InferenceSession(model_gather.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) def bram_estimation(self): mem_mode = self.get_nodeattr("mem_mode") @@ -466,15 +205,6 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 18 * 1024 return ebits / bram16_est_capacity - def get_ap_int_max_w(self): - parent_max = super().get_ap_int_max_w() - mem_mode = self.get_nodeattr("mem_mode") - ext_mem_width = self.get_nodeattr("ext_mem_width") - if mem_mode == "external": - return max(ext_mem_width, parent_max) - else: - return parent_max - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 11bd3406d5..16ed2cfd9a 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -664,6 +664,59 @@ def apply(self, model): return (model, graph_modified) +class InferLookupLayer(Transformation): + """Convert Gather nodes with constant op0 into Lookup HW layers.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Gather": + emb_name = node.input[0] + embs = model.get_initializer(emb_name) + axis = get_by_name(node.attribute, "axis") + # skip conversion if input0 is not constant + if embs is None: + continue + # skip conversion if axis != 0 + if axis is not None and axis.i != 0: + continue + ind_name = node.input[1] + ind_dtype = model.get_tensor_datatype(ind_name) + emb_dtype = model.get_tensor_datatype(emb_name) + # skip conversion if inputs are not unsigned integers + if (not ind_dtype.is_integer()) or ind_dtype.signed(): + continue + num_embs, emb_dim = embs.shape + out_name = node.output[0] + ishape = model.get_tensor_shape(node.input[1]) + # create and insert new Lookup node + new_node = helper.make_node( + "Lookup", + [ind_name, emb_name], + [out_name], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name="Lookup_" + node.name, + NumEmbeddings=num_embs, + EmbeddingDim=emb_dim, + EmbeddingType=emb_dtype.name, + InputType=ind_dtype.name, + InputShape=list(ishape), + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferStreamingEltwise(Transformation): """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer with SubEltwise or AbsDiffEltwise op.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py index d2861261b6..cb15fa3ae5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ -1,5 +1,5 @@ # Copyright (C) 2021-2022, Xilinx, Inc. -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,13 +44,14 @@ from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferLookupLayer +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferLookupLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN export_onnx_path = "test_lookup.onnx" @@ -121,12 +122,17 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): ret = execute_onnx(model, {iname: itensor}) exp_out = np.take(embeddings, itensor, axis=0) assert (exp_out == ret[oname]).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW layer and verify conversion model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname + ret_hw = execute_onnx(model, {iname: itensor}) + assert (exp_out == ret_hw[oname]).all() + # call transformation to convert abstraction layer into HLS layer + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Lookup_hls" if exec_mode == "cppsim": model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) @@ -166,14 +172,10 @@ def test_fpgadataflow_lookup_external(): assert tuple(model.get_tensor_shape(ename)) == eshape assert tuple(model.get_tensor_shape(oname)) == exp_oshape assert (model.get_initializer(ename) == embeddings).all() - # itensor = gen_finn_dt_tensor(idt, ishape).astype(np.int64) - # itensor = np.clip(itensor, 0, num_embeddings - 1) - # ret = execute_onnx(model, {iname: itensor}) - # exp_out = np.take(embeddings, itensor, axis=0) - # assert (exp_out == ret[oname]).all() - # call transformation to convert to HLS and verify conversion model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Lookup_hls" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname From 68e1442361583b394166c3da60aef938806a7038 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 11 Jan 2024 14:10:55 +0000 Subject: [PATCH 027/291] Delete old upsampler custom op and fix typo in comment --- src/finn/custom_op/fpgadataflow/upsampler.py | 2 +- .../custom_op/fpgadataflow/upsampler_batch.py | 351 ------------------ 2 files changed, 1 insertion(+), 352 deletions(-) delete mode 100644 src/finn/custom_op/fpgadataflow/upsampler_batch.py diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index b0264ffa8a..3348394e05 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -149,7 +149,7 @@ def get_number_output_values(self): return np.prod(folded_oshape[:-1]) def execute_node(self, context, graph): - # create a standard add node to help calculate the result + # create a standard resize node to help calculate the result node = self.onnx_node inp_values = context[node.input[0]] ishape = inp_values.shape diff --git a/src/finn/custom_op/fpgadataflow/upsampler_batch.py b/src/finn/custom_op/fpgadataflow/upsampler_batch.py deleted file mode 100644 index 9c0db1f3df..0000000000 --- a/src/finn/custom_op/fpgadataflow/upsampler_batch.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class UpsampleNearestNeighbour_Batch(HLSCustomOp): - """ - Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. - Upsampling is done with the Nearest Neighbour algorithm. - The layer expects square feature maps for the in and output. - """ - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - # Size of the output feature map - "OFMDim": ("i", True, 0), - # Size of the input feature map - "IFMDim": ("i", True, 0), - # Amount of channels of the input feature map - "NumChannels": ("i", True, 0), - # FINN input datatype - "inputDataType": ("s", True, ""), - # Batch size - "numInputVectors": ("i", False, 1), - # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim - "DimMode": ("i", False, 0), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_exp_cycles(self): - OFMDim = self.get_nodeattr("OFMDim") - batch_size = self.get_nodeattr("numInputVectors") - is_2d = self.get_nodeattr("DimMode") == 0 - reps = 1 - if is_2d: - OFMDim = OFMDim * OFMDim - reps = batch_size - exp_cycles = OFMDim * reps - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - IFMDim = self.get_nodeattr("IFMDim") - num_ch = self.get_nodeattr("NumChannels") - batch = self.get_nodeattr("numInputVectors") - is_2d = self.get_nodeattr("DimMode") == 0 - if is_2d: - ishape = (batch, IFMDim, IFMDim, num_ch) - else: - ishape = (batch, IFMDim, 1, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - OFMDim = self.get_nodeattr("OFMDim") - num_ch = self.get_nodeattr("NumChannels") - batch = self.get_nodeattr("numInputVectors") - is_2d = self.get_nodeattr("DimMode") == 0 - if is_2d: - oshape = (batch, OFMDim, OFMDim, num_ch) - else: - oshape = (batch, OFMDim, 1, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - return tuple(normal_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - return tuple(normal_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for UpsampleNearestNeighbour_Batch." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. (Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - ifm_ch = self.get_nodeattr("NumChannels") - return ibits * ifm_ch - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - ifm_ch = self.get_nodeattr("NumChannels") - return obits * ifm_ch - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] - - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] - - idim = self.get_nodeattr("IFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] - - odim = self.get_nodeattr("OFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] - - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - is_2d = self.get_nodeattr("DimMode") == 0 - batch = self.get_nodeattr("numInputVectors") - if is_2d: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" - % (self.hls_sname(), self.hls_sname()) - ] - else: - assert batch == 1, "1D upsampler currently needs numReps=1" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" - % (self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" From 9674cba6c3d5f3c0292e121e0e9d8957b65316a2 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 11 Jan 2024 14:15:15 +0000 Subject: [PATCH 028/291] [Tests] Temporarily marking hls conversion tests as xfail --- tests/end2end/test_end2end_mobilenet_v1.py | 1 + tests/transformation/test_infer_data_layouts_cnv.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 2d25a2bf0d..512558eb09 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -211,6 +211,7 @@ def test_end2end_mobilenet_lowering(): @pytest.mark.end2end +@pytest.mark.xfail def test_end2end_mobilenet_convert_to_hls_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx") model = model.transform(to_hls.InferPool_Batch()) diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py index 25bf890271..2d7fc54f94 100644 --- a/tests/transformation/test_infer_data_layouts_cnv.py +++ b/tests/transformation/test_infer_data_layouts_cnv.py @@ -56,6 +56,7 @@ @pytest.mark.transform +@pytest.mark.xfail def test_infer_data_layouts_cnv(): cnv = get_test_model_trained("CNV", 1, 1) export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) From d9819a24d4289bebb6ae61bda7ed3899c44ab0f8 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 11 Jan 2024 16:27:21 +0000 Subject: [PATCH 029/291] [CustomOp] Initial draft of dwc in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 12 +- .../custom_op/fpgadataflow/hls/__init__.py | 4 + .../hls/streamingdatawidthconverter_hls.py | 271 +++++++++ .../custom_op/fpgadataflow/rtl/__init__.py | 4 + .../streamingdatawidthconverter_rtl.py | 157 +---- .../streamingdatawidthconverter.py | 216 +++++++ .../streamingdatawidthconverter_batch.py | 540 ------------------ .../transformation/fpgadataflow/floorplan.py | 2 +- .../transformation/fpgadataflow/insert_dwc.py | 60 +- .../fpgadataflow/specialize_layers.py | 30 + tests/fpgadataflow/test_fpgadataflow_dwc.py | 108 +++- 11 files changed, 648 insertions(+), 756 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py rename src/finn/custom_op/fpgadataflow/{ => rtl}/streamingdatawidthconverter_rtl.py (63%) create mode 100644 src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py delete mode 100644 src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index bc9b9ae649..e4b645bbc2 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -1,5 +1,5 @@ # Copyright (C) 2020-2022, Xilinx, Inc. -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -53,11 +53,8 @@ from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, ) -from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( - StreamingDataWidthConverter_Batch, -) -from finn.custom_op.fpgadataflow.streamingdatawidthconverter_rtl import ( - StreamingDataWidthConverter_rtl, +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, ) from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO @@ -77,8 +74,6 @@ custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["TLastMarker"] = TLastMarker -custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch -custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Pixel"] = FMPadding_Pixel @@ -96,6 +91,7 @@ custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect custom_op["Lookup"] = Lookup +custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 38d28a66d6..1803b00023 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -33,6 +33,9 @@ from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls +from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( + StreamingDataWidthConverter_hls, +) from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls @@ -49,5 +52,6 @@ custom_op["LabelSelect_hls"] = LabelSelect_hls custom_op["Lookup_hls"] = Lookup_hls custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls +custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py new file mode 100644 index 0000000000..be096e63c7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -0,0 +1,271 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, +) +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# does not do anything at the ONNX node-by-node level, and input-output +# tensor shapes are the same. performs data width conversion at the rtlsim level + + +class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend): + """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch + function.""" + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + numReps = 1 + numInWords = int(np.prod(self.get_folded_input_shape()[:-1])) + inWidth = self.get_nodeattr("inWidth") + outWidth = self.get_nodeattr("outWidth") + self.code_gen_dict["$DEFINES$"] = [ + "#define InWidth %d " % inWidth, + "#define OutWidth %d " % outWidth, + "#define NumInWords %d " % numInWords, + "#define numReps %d" % numReps, + ] + if self.needs_lcm(): + lcmWidth = self.get_iowidth_lcm() + assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation" + numLCMToOut = numInWords // (lcmWidth / inWidth) + self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) + self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut)) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if self.needs_lcm(): + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> intermediate ("intermediate");'.format( + self.get_iowidth_lcm() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + # TODO continue with fxns below, they are copy-pasted + op = "StreamingDataWidthConverter_Batch" + if self.needs_lcm(): + self.code_gen_dict["$DOCOMPUTE$"] = [ + 'hls::stream> intermediate ("intermediate");'.format( + self.get_iowidth_lcm() + ), + "%s(in0_%s, intermediate, numReps);" + % (op, self.hls_sname()), + "%s(intermediate, out_%s, numReps);" + % (op, self.hls_sname()), + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s, numReps);" + % (op, self.hls_sname(), self.hls_sname()) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + in_packed_bits = self.get_instream_width() + in_packed_hls_type = "ap_uint<%d>" % in_packed_bits + out_packed_bits = self.get_outstream_width() + out_packed_hls_type = "ap_uint<%d>" % out_packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if self.needs_lcm(): + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded shape + reshaped_input = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert context[node.output[0]].shape == tuple( + exp_shape + ), """Output + shape doesn't match expected shape, should be same as input shape""" diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 7c9b2eaf22..81110d8b9f 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -27,9 +27,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from finn.custom_op.fpgadataflow.rtl.fmpadding_rtl import FMPadding_rtl +from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( + StreamingDataWidthConverter_rtl, +) custom_op = dict() # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["FMPadding_rtl"] = FMPadding_rtl +custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py similarity index 63% rename from src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_rtl.py rename to src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index 4f592bafaa..2d17897afe 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,10 +29,11 @@ import numpy as np import os import shutil -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, +) from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -42,41 +43,19 @@ PyVerilator = None -class StreamingDataWidthConverter_rtl(HLSCustomOp): +class StreamingDataWidthConverter_rtl(StreamingDataWidthConverter, RTLBackend): """Class that corresponds to finn-rtllib datawidth converter module.""" def get_nodeattr_types(self): my_attrs = { - # shape of input/output tensors - "shape": ("ints", True, []), - # bit width of input and output streams - "inWidth": ("i", True, 0), - "outWidth": ("i", True, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), # attribute to save top module name - not user configurable "gen_top_module": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_normal_input_shape(self, ind=0): - ishape = self.get_nodeattr("shape") - return ishape - - def get_normal_output_shape(self, ind=0): - oshape = self.get_nodeattr("shape") - return oshape - def check_divisible_iowidths(self): iwidth = self.get_nodeattr("inWidth") owidth = self.get_nodeattr("outWidth") @@ -95,83 +74,6 @@ def check_divisible_iowidths(self): owidth, ) - def get_folded_input_shape(self, ind=0): - self.check_divisible_iowidths() - iwidth = self.get_nodeattr("inWidth") - ishape = self.get_normal_input_shape() - dummy_t = np.random.randn(*ishape) - ibits = self.get_input_datatype().bitwidth() - assert ( - iwidth % ibits == 0 - ), """DWC input width must be divisible by - input element bitwidth""" - ielems = int(iwidth // ibits) - ichannels = ishape[-1] - new_shape = [] - for i in ishape[:-1]: - new_shape.append(i) - new_shape.append(int(ichannels // ielems)) - new_shape.append(ielems) - dummy_t = dummy_t.reshape(new_shape) - return dummy_t.shape - - def get_folded_output_shape(self, ind=0): - self.check_divisible_iowidths() - owidth = self.get_nodeattr("outWidth") - oshape = self.get_normal_output_shape() - dummy_t = np.random.randn(*oshape) - obits = self.get_output_datatype().bitwidth() - assert ( - owidth % obits == 0 - ), """DWC output width must be divisible by - input element bitwidth""" - oelems = int(owidth // obits) - ochannels = oshape[-1] - new_shape = [] - for i in oshape[:-1]: - new_shape.append(i) - new_shape.append(int(ochannels // oelems)) - new_shape.append(oelems) - dummy_t = dummy_t.reshape(new_shape) - - return dummy_t.shape - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_instream_width(self, ind=0): - in_width = self.get_nodeattr("inWidth") - return in_width - - def get_outstream_width(self, ind=0): - out_width = self.get_nodeattr("outWidth") - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node @@ -316,46 +218,3 @@ def code_generation_ipi(self): % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) ] return cmd - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: Generates C++ code and tcl script for IP generation. - Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py new file mode 100644 index 0000000000..4921caeb00 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py @@ -0,0 +1,216 @@ +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# does not do anything at the ONNX node-by-node level, and input-output +# tensor shapes are the same. performs data width conversion at the rtlsim level + + +class StreamingDataWidthConverter(HWCustomOp): + """Abstraction layer for HW implementation of StreamingDataWidthConverter""" + + def get_nodeattr_types(self): + my_attrs = { + # shape of input/output tensors + "shape": ("ints", True, []), + # bit width of input and output streams + "inWidth": ("i", True, 0), + "outWidth": ("i", True, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("dataType")] + + def get_normal_input_shape(self, ind=0): + ishape = self.get_nodeattr("shape") + return ishape + + def get_normal_output_shape(self, ind=0): + oshape = self.get_nodeattr("shape") + return oshape + + def get_iowidth_lcm(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + return int(np.lcm(iwidth, owidth)) + + def needs_lcm(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + maxwidth = max(iwidth, owidth) + minwidth = min(iwidth, owidth) + return maxwidth % minwidth != 0 + + def check_divisible_iowidths(self): + pass + + def get_folded_input_shape(self, ind=0): + self.check_divisible_iowidths() + iwidth = self.get_nodeattr("inWidth") + ishape = self.get_normal_input_shape() + dummy_t = np.random.randn(*ishape) + ibits = self.get_input_datatype().bitwidth() + assert ( + iwidth % ibits == 0 + ), """DWC input width must be divisible by + input element bitwidth""" + ielems = int(iwidth // ibits) + ichannels = ishape[-1] + new_shape = [] + for i in ishape[:-1]: + new_shape.append(i) + new_shape.append(int(ichannels // ielems)) + new_shape.append(ielems) + dummy_t = dummy_t.reshape(new_shape) + return dummy_t.shape + + def get_folded_output_shape(self, ind=0): + self.check_divisible_iowidths() + owidth = self.get_nodeattr("outWidth") + oshape = self.get_normal_output_shape() + dummy_t = np.random.randn(*oshape) + obits = self.get_output_datatype().bitwidth() + assert ( + owidth % obits == 0 + ), """DWC output width must be divisible by + input element bitwidth""" + oelems = int(owidth // obits) + ochannels = oshape[-1] + new_shape = [] + for i in oshape[:-1]: + new_shape.append(i) + new_shape.append(int(ochannels // oelems)) + new_shape.append(oelems) + dummy_t = dummy_t.reshape(new_shape) + + return dummy_t.shape + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_instream_width(self, ind=0): + in_width = self.get_nodeattr("inWidth") + return in_width + + def get_outstream_width(self, ind=0): + out_width = self.get_nodeattr("outWidth") + return out_width + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""StreamingDWC needs 1 data input""") + + return info_messages + + def execute_node(self, context, graph): + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." + + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + def lut_estimation(self): + """Calculates resource estimations for LUTs""" + inw = self.get_instream_width() + outw = self.get_outstream_width() + + minw = min(inw, outw) + maxw = max(inw, outw) + + # sometimes widths aren't directly divisible + # this requires going up from input width to least common multiple + # then down to output width + intw = abs(maxw * minw) // math.gcd(maxw, minw) + + # we assume a shift-based implementation + # even if we don't use LUTs explicitly, we make some unavailable + # to other logic because they're tied into the DWC control sets + + cnt_luts = 0 + cset_luts = 0 + + if inw != intw: + cnt_luts += abs(math.ceil(math.log(inw / intw, 2))) + cset_luts += intw + if intw != outw: + cnt_luts += abs(math.ceil(math.log(intw / outw, 2))) + cset_luts += outw + + return int(cnt_luts + cset_luts) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py deleted file mode 100644 index baf4aed502..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ /dev/null @@ -1,540 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import math -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -# does not do anything at the ONNX node-by-node level, and input-output -# tensor shapes are the same. performs data width conversion at the rtlsim level - - -class StreamingDataWidthConverter_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch - function.""" - - def get_nodeattr_types(self): - my_attrs = { - # shape of input/output tensors - "shape": ("ints", True, []), - # bit width of input and output streams - "inWidth": ("i", True, 0), - "outWidth": ("i", True, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # hls - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure DWC - "impl_style": ("s", False, "hls", {"hls", "vivado"}), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_normal_input_shape(self, ind=0): - ishape = self.get_nodeattr("shape") - return ishape - - def get_normal_output_shape(self, ind=0): - oshape = self.get_nodeattr("shape") - return oshape - - def check_divisible_iowidths(self): - impl_style = self.get_nodeattr("impl_style") - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - if impl_style == "vivado": - # the AXIS IP we use in vivado mode only supports - # stream widths that are divisible by 8 - iwidth_d8 = iwidth % 8 == 0 - owidth_d8 = owidth % 8 == 0 - assert ( - iwidth_d8 and owidth_d8 - ), """DWC impl_style=vivado requires - stream widths that are divisible by 8: (%d, %d)""" % ( - iwidth, - owidth, - ) - - def get_iowidth_lcm(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - return int(np.lcm(iwidth, owidth)) - - def needs_lcm(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - maxwidth = max(iwidth, owidth) - minwidth = min(iwidth, owidth) - impl_style = self.get_nodeattr("impl_style") - return (impl_style == "hls") and (maxwidth % minwidth != 0) - - def get_folded_input_shape(self, ind=0): - self.check_divisible_iowidths() - iwidth = self.get_nodeattr("inWidth") - ishape = self.get_normal_input_shape() - dummy_t = np.random.randn(*ishape) - ibits = self.get_input_datatype().bitwidth() - assert ( - iwidth % ibits == 0 - ), """DWC input width must be divisible by - input element bitwidth""" - ielems = int(iwidth // ibits) - ichannels = ishape[-1] - new_shape = [] - for i in ishape[:-1]: - new_shape.append(i) - new_shape.append(int(ichannels // ielems)) - new_shape.append(ielems) - dummy_t = dummy_t.reshape(new_shape) - return dummy_t.shape - - def get_folded_output_shape(self, ind=0): - self.check_divisible_iowidths() - owidth = self.get_nodeattr("outWidth") - oshape = self.get_normal_output_shape() - dummy_t = np.random.randn(*oshape) - obits = self.get_output_datatype().bitwidth() - assert ( - owidth % obits == 0 - ), """DWC output width must be divisible by - input element bitwidth""" - oelems = int(owidth // obits) - ochannels = oshape[-1] - new_shape = [] - for i in oshape[:-1]: - new_shape.append(i) - new_shape.append(int(ochannels // oelems)) - new_shape.append(oelems) - dummy_t = dummy_t.reshape(new_shape) - - return dummy_t.shape - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_instream_width(self, ind=0): - in_width = self.get_nodeattr("inWidth") - return in_width - - def get_outstream_width(self, ind=0): - out_width = self.get_nodeattr("outWidth") - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""StreamingDWC needs 1 data input""") - - return info_messages - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - numReps = 1 - numInWords = int(np.prod(self.get_folded_input_shape()[:-1])) - inWidth = self.get_nodeattr("inWidth") - outWidth = self.get_nodeattr("outWidth") - self.code_gen_dict["$DEFINES$"] = [ - "#define InWidth %d " % inWidth, - "#define OutWidth %d " % outWidth, - "#define NumInWords %d " % numInWords, - "#define numReps %d" % numReps, - ] - if self.needs_lcm(): - lcmWidth = self.get_iowidth_lcm() - assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation" - numLCMToOut = numInWords // (lcmWidth / inWidth) - self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) - self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut)) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if self.needs_lcm(): - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - # TODO continue with fxns below, they are copy-pasted - op = "StreamingDataWidthConverter_Batch" - if self.needs_lcm(): - self.code_gen_dict["$DOCOMPUTE$"] = [ - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ), - "%s(in0_%s, intermediate, numReps);" - % (op, self.hls_sname()), - "%s(intermediate, out_%s, numReps);" - % (op, self.hls_sname()), - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0_%s, out_%s, numReps);" - % (op, self.hls_sname(), self.hls_sname()) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - in_packed_bits = self.get_instream_width() - in_packed_hls_type = "ap_uint<%d>" % in_packed_bits - out_packed_bits = self.get_outstream_width() - out_packed_hls_type = "ap_uint<%d>" % out_packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - in_packed_hls_type, - self.hls_sname(), - out_packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if self.needs_lcm(): - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - impl_style = self.get_nodeattr("impl_style") - node = self.onnx_node - exp_shape = self.get_normal_input_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - assert impl_style == "hls", "DWC cppsim only possible when impl_style==hls" - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - assert impl_style == "hls", "DWC rtlsim only possible when impl_style==hls" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." - - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # reshape input into folded shape - reshaped_input = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = reshaped_input.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - output = inp - output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) - context[node.output[0]] = output - - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(exp_shape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to "rtlsim" """.format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert context[node.output[0]].shape == tuple( - exp_shape - ), """Output - shape doesn't match expected shape, should be same as input shape""" - - def code_generation_ipi(self): - impl_style = self.get_nodeattr("impl_style") - if impl_style == "hls": - return super().code_generation_ipi() - elif impl_style == "vivado": - cmd = [] - node_name = self.onnx_node.name - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate and configure DWC - cmd.append( - "create_bd_cell -type ip " - "-vlnv xilinx.com:ip:axis_dwidth_converter:1.1 /%s/dwc" % node_name - ) - cmd.append( - "set_property -dict " - "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC USER] " - "[get_bd_cells /%s/dwc]" % node_name - ) - cmd.append( - "set_property -dict " - "[list CONFIG.S_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]" - % (np.ceil(self.get_instream_width() / 8), node_name) - ) - cmd.append( - "set_property -dict " - "[list CONFIG.M_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]" - % (np.ceil(self.get_outstream_width() / 8), node_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/dwc/M_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/dwc/S_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aresetn]" - % (node_name, rst_name, node_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aclk]" - % (node_name, clk_name, node_name) - ) - return cmd - else: - raise Exception( - "DWC implementation style %s not supported, please use hls or vivado" % impl_style - ) - - def lut_estimation(self): - """Calculates resource estimations for LUTs""" - inw = self.get_instream_width() - outw = self.get_outstream_width() - - minw = min(inw, outw) - maxw = max(inw, outw) - - # sometimes withs aren't directly divisible - # this requires going up from input width to least common multiple - # then down to output width - intw = abs(maxw * minw) // math.gcd(maxw, minw) - - # we assume a shift-based implementation - # even if we don't use LUTs explicitly, we make some unavailable - # to other logic because they're tied into the DWC control sets - - cnt_luts = 0 - cset_luts = 0 - - if inw != intw: - cnt_luts += abs(math.ceil(math.log(inw / intw, 2))) - cset_luts += intw - if intw != outw: - cnt_luts += abs(math.ceil(math.log(intw / outw, 2))) - cset_luts += outw - - return int(cnt_luts + cset_luts) - - def prepare_rtlsim(self): - assert self.get_nodeattr("impl_style") != "vivado", ( - "StreamingDataWidthConverter impl_style " - "cannot be vivado for rtlsim. Only impl_style=rtl supported." - ) - super().prepare_rtlsim() - - def code_generation_ipgen(self, model, fpgapart, clk): - # no codegen required for impl_style=vivado since - # that uses premade, configurable AXIS IP - if self.get_nodeattr("impl_style") == "hls": - super().code_generation_ipgen(model, fpgapart, clk) - - def ipgen_singlenode_code(self): - # no IP generation required for impl_style=vivado since - # that uses premade, configurable AXIS IP - if self.get_nodeattr("impl_style") == "hls": - super().ipgen_singlenode_code() - else: - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # set ipgen_path and ip_path so that HLSSynthIP - # and CreatedStitchedIP transformations do not complain - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 336b3f80d0..fce2c2264c 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -81,7 +81,7 @@ def apply(self, model): if node_slr == -1: unassigned_nodes += 1 node_inst.set_nodeattr("slr", default_slr) - if node.op_type == "StreamingDataWidthConverter_Batch": + if node.op_type.startswith("StreamingDataWidthConverter"): # if we have SLR assignment already. use that if node_slr != -1: continue diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index bf0254c1a7..ee4311a5a1 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -1,4 +1,31 @@ -import warnings +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp @@ -8,7 +35,7 @@ def _is_dwc_node(node): - if node.op_type == "StreamingDataWidthConverter_Batch": + if node.op_type.startswith("StreamingDataWidthConverter"): return True else: return False @@ -34,9 +61,8 @@ def _suitable_node(node): class InsertDWC(Transformation): """Add data width converters between layers where necessary.""" - def __init__(self, use_rtl_variant=True): + def __init__(self): super().__init__() - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -50,7 +76,7 @@ def apply(self, model): if consumers == []: continue assert len(consumers) == 1, ( - n.name + ": HLS node with fan-out higher than 1 cannot be stitched" + n.name + ": HW node with fan-out higher than 1 cannot be stitched" ) consumer = consumers[0] if _suitable_node(consumer) is True: @@ -82,20 +108,7 @@ def apply(self, model): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() - if self.use_rtl_variant: - # check if rtl variant can be used - iwidth_d = dwc_in_width % dwc_out_width == 0 - owidth_d = dwc_out_width % dwc_in_width == 0 - if iwidth_d or owidth_d: - node_optype = "StreamingDataWidthConverter_rtl" - else: - warnings.warn( - "DWC cannot be implemented as RTL variant, default to hls" - ) - node_optype = "StreamingDataWidthConverter_Batch" - self.use_rtl_variant = False - else: - node_optype = "StreamingDataWidthConverter_Batch" + node_optype = "StreamingDataWidthConverter" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -121,15 +134,6 @@ def apply(self, model): outWidth=dwc_out_width, dataType=str(dtype.name), ) - # if not rtl variant is selected - # use hls mode by default since it supports more configs - # vivado mode can be manually enabled by user, but does not - # support e.g. node-by-node rtlsim neded for - # characterization-based FIFO sizing - if not self.use_rtl_variant: - impl_attr = oh.make_attribute("impl_style", "hls") - dwc_node.attribute.append(impl_attr) - # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 4b2687faee..eff40f83f3 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -60,6 +60,8 @@ def _determine_impl_style(node): # if impl_style not set, for "simple" layers always try # to use rtl variant if available if impl_style == "": + if optype == "StreamingDataWidthConverter": + return _dwc_determine_impl_style(node) if rtl_variant: return "rtl" # but if no rtl variant, set impl_style to hls @@ -94,6 +96,20 @@ def _determine_impl_style(node): ) ) elif impl_style == "rtl": + # rtl dwc does not support every inWidth to outWidth ratio + if optype == "StreamingDataWidthConverter": + if _dwc_determine_impl_style(node) != "rtl": + warn_str = """RTL implementation of DWC requires + stream widths that are integer width ratios + from each other. Node %s will automatically be + set to HLS variant.""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" + else: + # user setting can be fulfilled + return "rtl" if rtl_variant: return "rtl" elif hls_variant: @@ -119,6 +135,20 @@ def _determine_impl_style(node): ) +def _dwc_determine_impl_style(node): + # when possible use rtl variant + dwc = getCustomOp(node) + dwc_in_width = dwc.get_nodeattr("inWidth") + dwc_out_width = dwc.get_nodeattr("outWidth") + # check if rtl variant can be used + iwidth_d = dwc_in_width % dwc_out_width == 0 + owidth_d = dwc_out_width % dwc_in_width == 0 + if iwidth_d or owidth_d: + return "rtl" + else: + return "hls" + + class SpecializeLayers(Transformation): """Specialize all layers to either HLS or RTL variants""" diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 47332f069b..706b3d2065 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -1,5 +1,5 @@ # Copyright (C) 2020-2022, Xilinx, Inc. -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,20 +36,22 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style, use_rtl_variant): +def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, shape) - if use_rtl_variant: - optype = "StreamingDataWidthConverter_rtl" - else: - optype = "StreamingDataWidthConverter_Batch" + optype = "StreamingDataWidthConverter" DWC_node = helper.make_node( optype, @@ -62,10 +64,6 @@ def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_styl outWidth=outWidth, dataType=str(finn_dtype.name), ) - if not use_rtl_variant: - # add additional attribute - impl_attr = helper.make_attribute("impl_style", impl_style) - DWC_node.attribute.append(impl_attr) graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]) @@ -85,39 +83,89 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize( "config", [ - ([1, 24], 6, 4, DataType["INT2"], "hls"), - ([1, 24], 4, 6, DataType["INT2"], "hls"), - ([1, 4], 2, 4, DataType["BIPOLAR"], "hls"), - ([1, 2, 8], 2, 4, DataType["BIPOLAR"], "hls"), - ([1, 4], 4, 2, DataType["INT2"], "hls"), - ([1, 2, 8], 4, 4, DataType["INT2"], "hls"), - ([1, 2, 8], 8, 16, DataType["INT2"], "vivado"), + ([1, 24], 6, 4, DataType["INT2"]), + ([1, 24], 4, 6, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), ], ) -@pytest.mark.parametrize("use_rtl_variant", [0, 1]) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc_rtlsim(config, use_rtl_variant): - shape, inWidth, outWidth, finn_dtype, impl_style = config - - if use_rtl_variant: - iwidth_d = inWidth % outWidth == 0 - owidth_d = outWidth % inWidth == 0 - if not (iwidth_d or owidth_d): - pytest.skip("RTL variant only supports stream widths that are divisible by int ratios") +def test_fpgadataflow_dwc_rtlsim(config, exec_mode): + shape, inWidth, outWidth, finn_dtype = config + + test_fpga_part = "xc7z020clg400-1" + # generate input data + x = gen_finn_dt_tensor(finn_dtype, shape) + input_dict = prepare_inputs(x, finn_dtype) + + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + # verify abstraction level execution + y = oxe.execute_onnx(model, input_dict)["outp"] + assert ( + y == x + ).all(), """The output values are not the same as the + input values anymore.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" + + model = model.transform(SpecializeLayers()) + if exec_mode == "cppsim": + if model.graph.node[0].op_type == "StreamingDataWidthConverter_rtl": + pytest.skip("cppsim not supported for RTL DWC") + else: + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert ( + y == x + ).all(), """The output values are not the same as the + input values anymore.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" + + +@pytest.mark.parametrize( + "config", + [ + ([1, 24], 6, 4, DataType["INT2"]), + ([1, 24], 4, 6, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), + ], +) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_dwc_stitched_rtlsim(config): + shape, inWidth, outWidth, finn_dtype = config + test_fpga_part = "xc7z020clg400-1" target_clk_ns = 10.0 # generate input data x = gen_finn_dt_tensor(finn_dtype, shape) input_dict = prepare_inputs(x, finn_dtype) - model = make_single_dwc_modelwrapper( - shape, inWidth, outWidth, finn_dtype, impl_style, use_rtl_variant - ) + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + model = model.transform(SpecializeLayers()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, 5)) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") From 50c795f3db055e7c7149655221e05a3f9761c7a5 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 15 Jan 2024 14:21:11 +0000 Subject: [PATCH 030/291] [CustomOp] Initial draft of downsampler in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 +- .../custom_op/fpgadataflow/downsampler.py | 251 ++++-------------- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/downsampler_hls.py | 244 +++++++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 119 +++++++++ .../test_fpgadataflow_downsampler.py | 10 +- 6 files changed, 428 insertions(+), 200 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index e4b645bbc2..157dfa5c53 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -68,7 +68,6 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure -custom_op["DownSampler"] = DownSampler custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D @@ -87,6 +86,7 @@ custom_op["FMPadding"] = FMPadding custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp +custom_op["DownSampler"] = DownSampler custom_op["DuplicateStreams"] = DuplicateStreams custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index e2cea6da6b..4f919d1b50 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,16 +27,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class DownSampler(HLSCustomOp): - """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. +class DownSampler(HWCustomOp): + """Abstraction layer for HW implementation of DownSampling Basically performs a down sampling of the image removing rows and columns.""" def __init__(self, onnx_node, **kwargs): @@ -174,197 +176,54 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] - - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] - - idim = self.get_nodeattr("ImgDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] - - simd = self.get_nodeattr("SIMD") - self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)] - - stride = self.get_nodeattr("Stride") - self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)] - - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" - sname = self.hls_sname() - self.code_gen_dict["$DOCOMPUTE$"] = [ - f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0_{sname}, out_{sname}, numReps);""" - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" - % ( - self.onnx_node.name, - packed_hls_type, - self.hls_sname(), - packed_hls_type, - self.hls_sname(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # using Im2Col node to calculate output node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + ifm_dim = self.get_nodeattr("ImgDim") + stride = self.get_nodeattr("Stride") + ifm_ch = self.get_nodeattr("NumChannels") + # check if 1D or 2D case + if self.get_nodeattr("is1D"): + if self.get_nodeattr("is1D_unitx"): + ifm_dim_w = 1 + sw = 1 + ifm_dim_h = ifm_dim + sh = stride + else: + ifm_dim_h = 1 + sh = 1 + ifm_dim_w = ifm_dim + sw = stride else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + ifm_dim_h = ifm_dim_w = ifm_dim + sh = sw = stride + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + im2col_node = helper.make_node( + "Im2Col", + [node.input[0]], + [node.output[0]], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[1, 1], + input_shape="(1,{},{},{})".format(ifm_dim_h, ifm_dim_w, ifm_ch), + ) + graph_im2col = helper.make_graph( + nodes=[im2col_node], + name="single-im2col-exec", + inputs=[inp], + outputs=[outp], + ) - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs)) + model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype()) + # use execution function from Im2Col node + # this automatically updates the execution context + inst = getCustomOp(im2col_node) + inst.execute_node(context, model_im2col.graph) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 1803b00023..8b1ca6e719 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -28,6 +28,7 @@ from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls @@ -46,6 +47,7 @@ # registered and plug in correctly into the infrastructure custom_op["AddStreams_hls"] = AddStreams_hls custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["DownSampler_hls"] = DownSampler_hls custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["FMPadding_hls"] = FMPadding_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py new file mode 100644 index 0000000000..d5bd0877a4 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -0,0 +1,244 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.downsampler import DownSampler +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class DownSampler_hls(DownSampler, HLSBackend): + """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. + Basically performs a down sampling of the image removing rows and columns.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(DownSampler.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("ImgDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + simd = self.get_nodeattr("SIMD") + self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)] + + stride = self.get_nodeattr("Stride") + self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" + sname = self.hls_sname() + self.code_gen_dict["$DOCOMPUTE$"] = [ + f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0_{sname}, out_{sname}, numReps);""" + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 16ed2cfd9a..a65c925f97 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -41,6 +41,125 @@ from qonnx.util.onnx import nchw_to_nhwc +class InferConvInpGen(Transformation): + """Convert Im2Col layers to ConvolutionInputGenerator layers.""" + + def __init__(self, use_rtl_variant=False): + super().__init__() + self.use_rtl_variant = use_rtl_variant + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "Im2Col": + i2c_input = n.input[0] + i2c_output = n.output[0] + i2c_in_shape = model.get_tensor_shape(i2c_input) + dt = model.get_tensor_datatype(i2c_input) + if not dt.is_integer(): + warnings.warn("%s : Input is not int. Can't infer ConvInpGen." % n.name) + continue + i2c_inst = getCustomOp(n) + stride_h, stride_w = i2c_inst.get_nodeattr("stride") + k_h, k_w = i2c_inst.get_nodeattr("kernel_size") + pad_attr = i2c_inst.get_nodeattr("pad_amount") + pad_h = pad_attr[0] + pad_attr[2] + pad_w = pad_attr[1] + pad_attr[3] + dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations") + # temporary checks until non-square conv support is finalized + pad_val = i2c_inst.get_nodeattr("pad_value") + ifm_ch = i2c_in_shape[-1] + ifm_dim_h = i2c_in_shape[1] + ifm_dim_w = i2c_in_shape[2] + + # default params for ConvolutionInputGenerator + ConvInpGen_node_idx = node_ind + ConvInpGen_input = i2c_input + ConvInpGen_idim_h = ifm_dim_h + ConvInpGen_idim_w = ifm_dim_w + + if pad_h > 0 or pad_w > 0: + assert pad_val == 0, ( + "%s : FMPadding_Batch doesn't currently support pad_val!= 0" % n.name + ) + + odim_padding_h = ifm_dim_h + pad_h + odim_padding_w = ifm_dim_w + pad_w + + padding_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, odim_padding_h, odim_padding_w, ifm_ch), + ) + graph.value_info.append(padding_out) + padding_out = padding_out.name + model.set_tensor_datatype(padding_out, dt) + + ConvInpGen_node_idx += 1 + ConvInpGen_input = padding_out + ConvInpGen_idim_h = odim_padding_h + ConvInpGen_idim_w = odim_padding_w + + padding_node = helper.make_node( + "FMPadding", + [i2c_input], + [padding_out], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ImgDim=[ifm_dim_h, ifm_dim_w], + Padding=pad_attr, + NumChannels=ifm_ch, + inputDataType=dt.name, + SIMD=ifm_ch, + name="FMPadding_Batch_" + n.name, + ) + graph.node.insert(node_ind, padding_node) + + is_kernel_pointwise = k_h == 1 and k_w == 1 + is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w + is_equal_stride = stride_h == stride_w + + # Ensure that only supported HLS nodes are inserted + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + is1D_unitx = ifm_dim_w == 1 + downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride + if not (downsample_1D or downsample_2D): + warnings.warn(f"Couldn't infer Downsample from {n.name},check config.") + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node + ConvInpGen_node = helper.make_node( + "DownSampler", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, + SIMD=ifm_ch, + Stride=stride, + inputDataType=dt.name, + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + else: + continue + # remove old nodes + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferUpsample(Transformation): """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py index 8a3c1fe682..25717a4152 100644 --- a/tests/fpgadataflow/test_fpgadataflow_downsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -39,7 +39,7 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -48,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): @@ -126,8 +127,11 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0")) idict = {"in0": inp} y_expected = execute_onnx(model, idict)["out0"] - model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hw.InferConvInpGen()) assert len(model.get_nodes_by_op_type("DownSampler")) == 1 + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all() + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -143,7 +147,7 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("DownSampler")[0] + node = model.get_nodes_by_op_type("DownSampler_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From e3ab5fcc49638f7dc2707ab8bb4f9d0337bc48f0 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 15 Jan 2024 16:12:48 +0000 Subject: [PATCH 031/291] [BTS] Binary Thresholding Search base Applying the original BTS pull request #687 by fionnodonohoe-xlnx to add updates and bug fixes --- .../finn.custom_op.fpgadataflow.rst | 8 + finn-rtllib/thresholding/component.xml | 1002 +++++++++++++++++ .../gui/thresholding_axi_v1_0.gtcl | 4 + finn-rtllib/thresholding/hdl/axilite_if.v | 210 ++++ finn-rtllib/thresholding/hdl/thresholding.sv | 358 ++++++ .../thresholding/hdl/thresholding_axi.sv | 164 +++ .../hdl/thresholding_template_wrapper.v | 120 ++ finn-rtllib/thresholding/sim/thresh_gen.sv | 45 + finn-rtllib/thresholding/sim/thresholding.tcl | 17 + .../thresholding/sim/thresholding_axi_tb.sv | 314 ++++++ .../thresholding/sim/thresholding_tb.sv | 274 +++++ .../xgui/thresholding_axi_v1_0.tcl | 187 +++ src/finn/custom_op/fpgadataflow/__init__.py | 4 + .../thresholding_binary_search.py | 579 ++++++++++ .../fpgadataflow/convert_to_hls_layers.py | 81 +- src/finn/util/basic.py | 19 + .../test_convert_to_hls_thresholding.py | 276 +++++ ...fpgadataflow_thresholding_binary_search.py | 287 +++++ tests/util/test_basic.py | 60 + 19 files changed, 3988 insertions(+), 21 deletions(-) create mode 100644 finn-rtllib/thresholding/component.xml create mode 100644 finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl create mode 100644 finn-rtllib/thresholding/hdl/axilite_if.v create mode 100644 finn-rtllib/thresholding/hdl/thresholding.sv create mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi.sv create mode 100644 finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v create mode 100644 finn-rtllib/thresholding/sim/thresh_gen.sv create mode 100644 finn-rtllib/thresholding/sim/thresholding.tcl create mode 100644 finn-rtllib/thresholding/sim/thresholding_axi_tb.sv create mode 100644 finn-rtllib/thresholding/sim/thresholding_tb.sv create mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl create mode 100755 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py create mode 100755 tests/fpgadataflow/test_convert_to_hls_thresholding.py create mode 100755 tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py create mode 100755 tests/util/test_basic.py diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index fdcf44c6d9..3627855cfb 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -203,6 +203,14 @@ finn.custom\_op.fpgadataflow.thresholding\_batch :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.thresholding\_binary\_search +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.thresholding_binary_search + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.tlastmarker ----------------------------------------------- diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml new file mode 100644 index 0000000000..e28a3a2c2d --- /dev/null +++ b/finn-rtllib/thresholding/component.xml @@ -0,0 +1,1002 @@ + + + amd.com + finn + thresholding_axi + 1.0 + + + ap_clk + + + + + + + CLK + + + ap_clk + + + + + + ASSOCIATED_RESET + ap_rst_n + + + ASSOCIATED_BUSIF + s_axilite:s_axis:m_axis + + + FREQ_TOLERANCE_HZ + -1 + + + + + m_axis + + + + + + + TDATA + + + m_axis_tdata + + + + + TVALID + + + m_axis_tvalid + + + + + TREADY + + + m_axis_tready + + + + + + s_axis + + + + + + + TDATA + + + s_axis_tdata + + + + + TVALID + + + s_axis_tvalid + + + + + TREADY + + + s_axis_tready + + + + + + s_axilite + + + + + + + + + AWADDR + + + s_axilite_AWADDR + + + + + AWVALID + + + s_axilite_AWVALID + + + + + AWREADY + + + s_axilite_AWREADY + + + + + WDATA + + + s_axilite_WDATA + + + + + WSTRB + + + s_axilite_WSTRB + + + + + WVALID + + + s_axilite_WVALID + + + + + WREADY + + + s_axilite_WREADY + + + + + BRESP + + + s_axilite_BRESP + + + + + BVALID + + + s_axilite_BVALID + + + + + BREADY + + + s_axilite_BREADY + + + + + ARADDR + + + s_axilite_ARADDR + + + + + ARVALID + + + s_axilite_ARVALID + + + + + ARREADY + + + s_axilite_ARREADY + + + + + RDATA + + + s_axilite_RDATA + + + + + RRESP + + + s_axilite_RRESP + + + + + RVALID + + + s_axilite_RVALID + + + + + RREADY + + + s_axilite_RREADY + + + + + + ap_rst_n + + + + + + + RST + + + ap_rst_n + + + + + + POLARITY + ACTIVE_LOW + + + + + + + s_axilite + s_axilite + + reg0 + reg0 + 0x0 + 4096 + 32 + register + + + + + + + xilinx_anylanguagesynthesis + Synthesis + :vivado.xilinx.com:synthesis + Verilog + thresholding_axi_wrapper + + xilinx_anylanguagesynthesis_view_fileset + + + + viewChecksum + fd0bd85b + + + + + xilinx_anylanguagebehavioralsimulation + Simulation + :vivado.xilinx.com:simulation + Verilog + thresholding_axi_wrapper + + xilinx_anylanguagebehavioralsimulation_view_fileset + + + + viewChecksum + fd0bd85b + + + + + xilinx_xpgui + UI Layout + :vivado.xilinx.com:xgui.ui + + xilinx_xpgui_view_fileset + + + + viewChecksum + fc6b9b63 + + + + + xilinx_utilityxitfiles + Utility XIT/TTCL + :vivado.xilinx.com:xit.util + + xilinx_utilityxitfiles_view_fileset + + + + viewChecksum + 8b0215cd + + + + + + + ap_clk + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + ap_rst_n + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_AWVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_AWREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_AWADDR + + in + + 5 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_WVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_WREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_WDATA + + in + + 31 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_WSTRB + + in + + 3 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 1 + + + + + s_axilite_BVALID + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_BREADY + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_BRESP + + out + + 1 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_ARVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_ARREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_ARADDR + + in + + 5 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_RVALID + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_RREADY + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_RDATA + + out + + 31 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_RRESP + + out + + 1 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_tready + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_tvalid + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_tdata + + in + + 15 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + m_axis_tready + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 1 + + + + + m_axis_tvalid + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + m_axis_tdata + + out + + 7 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + + + N + N + 4 + + + K + K + 16 + + + C + C + 1 + + + PE + Pe + 1 + + + SIGNED + Signed + true + + + FPARG + Fparg + false + + + BIAS + Bias + 0 + + + CF + Cf + 1 + + + ADDR_BITS + Addr Bits + 6 + + + O_BITS + O Bits + 4 + + + + + + choice_list_9d8b0d81 + ACTIVE_HIGH + ACTIVE_LOW + + + + + xilinx_anylanguagesynthesis_view_fileset + + hdl/thresholding.sv + systemVerilogSource + + + hdl/thresholding_axi.sv + systemVerilogSource + + + hdl/thresholding_axi_wrapper.v + verilogSource + CHECKSUM_7b8c102d + + + hdl/axilite_if.v + verilogSource + CHECKSUM_69d1ba26 + xil_defaultlib + + + + xilinx_anylanguagebehavioralsimulation_view_fileset + + hdl/thresholding.sv + systemVerilogSource + + + hdl/thresholding_axi.sv + systemVerilogSource + + + hdl/thresholding_axi_wrapper.v + verilogSource + + + hdl/axilite_if.v + verilogSource + USED_IN_ipstatic + xil_defaultlib + + + + xilinx_xpgui_view_fileset + + xgui/thresholding_axi_v1_0.tcl + tclSource + CHECKSUM_fc6b9b63 + XGUI_VERSION_2 + + + + xilinx_utilityxitfiles_view_fileset + + gui/thresholding_axi_v1_0.gtcl + GTCL + + + + MultiThreshold + + + N + Output Precision + 4 + + + K + Input Precision + 16 + + + C + Channels + 1 + + + PE + Pe + 1 + + + SIGNED + Signed Inputs + true + + + FPARG + Floating-Point Inputs + false + + + BIAS + Bias + 0 + + + CF + Channel Fold + 1 + + + + false + + + + + + ADDR_BITS + Address Bits + 6 + + + + false + + + + + + O_BITS + Output Value Width + 4 + + + + false + + + + + + Component_Name + thresholding_axi_wrapper_v1_0 + + + + + + virtex7 + qvirtex7 + versal + kintex7 + kintex7l + qkintex7 + qkintex7l + akintex7 + artix7 + artix7l + aartix7 + qartix7 + zynq + qzynq + azynq + spartan7 + aspartan7 + virtexu + zynquplus + virtexuplus + virtexuplusHBM + virtexuplus58g + kintexuplus + artixuplus + kintexu + + + /UserIP + + thresholding_axi + level_1 + package_project + 2 + + user.org:user:thresholding_axi_wrapper:1.0 + + 2023-06-27T05:47:20Z + + + + + + 2022.2 + + + + + + + + + + + + + + diff --git a/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl b/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl new file mode 100644 index 0000000000..90d73ede7e --- /dev/null +++ b/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl @@ -0,0 +1,4 @@ +# This file is automatically written. Do not modify. +proc gen_USERPARAMETER_CF_VALUE {C PE } {expr $C/$PE} +proc gen_USERPARAMETER_ADDR_BITS_VALUE {C PE N } {expr int(ceil(log($C/$PE)/log(2))+ceil(log($PE)/log(2))+$N+2)} +proc gen_USERPARAMETER_O_BITS_VALUE {BIAS N } {expr int(ceil($BIAS >= 0? log(pow(2,$N)+$BIAS)/log(2) : 1+log(-$BIAS >= pow(2,$N-1)? -$BIAS : pow(2,$N)+$BIAS)/log(2)))} diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v new file mode 100644 index 0000000000..bdd4de288e --- /dev/null +++ b/finn-rtllib/thresholding/hdl/axilite_if.v @@ -0,0 +1,210 @@ +/* + Copyright (c) 2020, Xilinx + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of FINN nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +module axi4lite_if +#( + parameter ADDR_WIDTH = 32, + parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64 + parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH +) +( +//system signals +input aclk, +input aresetn,//active low, asynchronous assertion and synchronous deassertion + +//Write channels +//write address +output reg awready, +input awvalid, +input [ADDR_WIDTH-1:0] awaddr, +input [2:0] awprot, +//write data +output reg wready, +input wvalid, +input [DATA_WIDTH-1:0] wdata, +input [(DATA_WIDTH/8)-1:0] wstrb, +//burst response +input bready, +output reg bvalid, +output reg [1:0] bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error) + +//Read channels +//read address +output reg arready, +input arvalid, +input [ADDR_WIDTH-1:0] araddr, +input [2:0] arprot, +//read data +input rready, +output reg rvalid, +output reg [1:0] rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error) +output reg [DATA_WIDTH-1:0] rdata, + +//IP-side interface +output reg ip_en, +output reg ip_wen, +output reg [ADDR_WIDTH-1:0] ip_addr, +output [IP_DATA_WIDTH-1:0] ip_wdata, +input ip_rack, +input [IP_DATA_WIDTH-1:0] ip_rdata +); + +localparam RESP_OKAY = 2'b00; +localparam RESP_SLVERR = 2'b10; +//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH))) +localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH); + +reg internal_ren; +reg internal_wen; +reg internal_wack; +reg [ADDR_WIDTH-1:0] internal_raddr; +reg [ADDR_WIDTH-1:0] internal_waddr; +reg [DATA_WIDTH-1:0] internal_wdata; +wire [DATA_WIDTH-1:0] internal_rdata; +reg internal_error = 0; + +//check DATA_WIDTH +initial begin + if(DATA_WIDTH != 32 & DATA_WIDTH != 64) begin + $display("AXI4Lite DATA_WIDTH must be 32 or 64"); + $finish; + end +end + +//transaction state machine +localparam STATE_IDLE = 0, + STATE_READ = 1, + STATE_WRITE = 2; + +reg [1:0] state; + +always @(posedge aclk or negedge aresetn) + if(~aresetn) + state <= STATE_IDLE; + else case(state) + STATE_IDLE: + if(awvalid & wvalid) + state <= STATE_WRITE; + else if(arvalid) + state <= STATE_READ; + STATE_READ: + if(rvalid & rready) + state <= STATE_IDLE; + STATE_WRITE: + if(bvalid & bready) + state <= STATE_IDLE; + default: state <= STATE_IDLE; + endcase + +//write-related internal signals +always @(*) begin + internal_waddr = awaddr >> $clog2(DATA_WIDTH/8); + internal_wdata = wdata; + internal_wen = (state == STATE_IDLE) & awvalid & wvalid; +end + +always @(posedge aclk) begin + awready <= internal_wen; + wready <= internal_wen; +end + +//read-related internal signals +always @(*) begin + internal_raddr = araddr >> $clog2(DATA_WIDTH/8); + internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid; +end + +always @(posedge aclk) + arready <= internal_ren; + +wire write_to_last_fold; + +always @(posedge aclk) begin + ip_wen <= write_to_last_fold; + ip_en <= internal_ren | write_to_last_fold; + if(internal_ren | write_to_last_fold) + ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG); + internal_wack <= internal_wen; +end + +genvar i; +reg [(1<> (internal_rfold*DATA_WIDTH); + always @(posedge aclk) + if(internal_ren) + internal_rfold <= internal_raddr[NFOLDS_LOG-1:0]; + for(i=0; i<(1< + * + * @description + * Produces the N-bit count of those among 2^N-1 thresholds that are not + * larger than the corresponding input: + * y = Σ(T_i <= x) + * The result is computed by binary search. The runtime-configurable + * thresholds must be written in ascending order: + * i < j => T_i < T_j + * The design supports channel folding allowing each input to be processed + * with respect to a selectable set of thresholds. The corresponding + * threshold configuration relies on a channel address prefix. Inputs are + * accompanied by a channel selector. + * + * Parameter Layout as seen on AXI-Lite (row by row): + * | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1 + * ---------+--------------------------------+------------------------------------ + * Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #c | ((c/PE)*$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * + *****************************************************************************/ +module thresholding #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C, // number of channels + int unsigned PE, // parallel processing elements + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + bit USE_CONFIG = 1, + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel fold + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) +)( + // Global Control + input logic clk, + input logic rst, + + // Threshold Configuration + input logic cfg_en, + input logic cfg_we, + input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a, + input logic [K-1:0] cfg_d, + output logic cfg_rack, + output logic [K-1:0] cfg_q, + + // Input Stream + output logic irdy, + input logic ivld, + input logic [PE-1:0][K-1:0] idat, + + // Output Stream + input logic ordy, + output logic ovld, + output logic [PE-1:0][O_BITS-1:0] odat +); + + // Parameter Constraints Checking + initial begin + if(CF*PE != C) begin + $error("Parallelism PE=%0d is not a multiple of channel count C=%0d.", PE, C); + $finish; + end + end + + // Operations within Pipeline + typedef enum logic [1:0] { + NOP = 2'b00, // No operation + TH = 2'b01, // Thresholding + WR = 2'b11, // Write (initialization) + RB = 2'b10, // Readback (validation) + CFG = 2'b1x // Config op (pointer-preserving) + } op_e; + + // Pipeline Link Type + typedef logic [$clog2(CF)+N-1:0] ptr_t; + typedef logic [K -1:0] val_t; + typedef struct packed { + op_e op; + ptr_t ptr; // WR/RB: address; TH: result + val_t val; // WR/RB: threshold value; TH: input value + } pipe_t; + + //----------------------------------------------------------------------- + // Pipeline Feed + // - configuration always takes precedence + // - number of pending thresholding ops capped to N+3 + // across pipeline and output FIFO: pipe:N + A:1 + B:1 + 1 + localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3; + pipe_t pipe[PE][N+1]; + if(1) begin : blkFeed + + // Thresholding Input Guard ensuring Output FIFO is never overrun + logic signed [$clog2(MAX_PENDING):0] GuardSem = MAX_PENDING-1; // MAX_PENDING-1, ..., 0, -1 + uwire th_full = GuardSem[$left(GuardSem)]; + always_ff @(posedge clk) begin + if(rst) GuardSem <= MAX_PENDING-1; + else begin + automatic logic dec = !(USE_CONFIG && cfg_en) && !th_full && ivld; + automatic logic inc = ovld && ordy; + GuardSem <= GuardSem + (inc == dec? 0 : inc? 1 : -1); + end + end + + // PE Configuration Address Decoding + uwire cfg_sel[PE]; + if(PE == 1) assign cfg_sel[0] = 1; + else begin + for(genvar pe = 0; pe < PE; pe++) begin + assign cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_a[N+:$clog2(PE)] == pe); + end + end + + uwire ptr_t iptr; + assign iptr[0+:N] = cfg_a[0+:N]; + if(CF > 1) begin + // Channel Fold Rotation + logic [$clog2(CF)-1:0] CnlCnt = 0; + logic CnlLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + CnlCnt <= 0; + CnlLst <= 0; + end + else if(!(USE_CONFIG && cfg_en) && !th_full && ivld) begin + CnlCnt <= CnlCnt + (CnlLst? 1-CF : 1); + CnlLst <= CnlCnt == CF-2; + end + end + + assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt; + end + + for(genvar pe = 0; pe < PE; pe++) begin + assign pipe[pe][0] = '{ + op: USE_CONFIG && cfg_en? + (!cfg_sel[pe]? NOP : cfg_we? WR : RB) : + (ivld && !th_full? TH : NOP), + ptr: iptr, + val: !(USE_CONFIG && cfg_en)? idat[pe] : cfg_we? cfg_d : 0 + }; + end + + assign irdy = !(USE_CONFIG && cfg_en) && !th_full; + end : blkFeed + + //----------------------------------------------------------------------- + // Free-Running Thresholding Pipeline + for(genvar stage = 0; stage < N; stage++) begin : genStages + + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin : genPE + uwire pipe_t p = pipe[pe][stage]; + uwire cs = (p.ptr[SN:0] == 2**SN-1); + + // Threshold Memory + val_t Thresh; // Read-out register + if(1) begin : blkThresh + localparam int unsigned DEPTH = CF * 2**stage; + localparam RAM_STYLE = + DEPTH_TRIGGER_URAM && (DEPTH >= DEPTH_TRIGGER_URAM)? "ultra" : + DEPTH_TRIGGER_BRAM && (DEPTH >= DEPTH_TRIGGER_BRAM)? "block" : + // If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless. + DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto"; + + (* RAM_STYLE = RAM_STYLE *) + val_t Threshs[DEPTH]; + if(THRESHOLDS_PATH != "") begin + localparam FILE = $sformatf("%s/threshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); + initial $readmemh(FILE, Threshs); + end + + if(USE_CONFIG) begin : genThreshMem + uwire we = (p.op ==? WR) && cs; + if((CF == 1) && (stage == 0)) begin + always @(posedge clk) begin + if(we) Threshs[0] <= p.val; + end + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always @(posedge clk) begin + if(we) Threshs[addr] <= p.val; + end + end + end : genThreshMem + + if((CF == 1) && (stage == 0)) begin + assign Thresh = Threshs[0]; + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always_ff @(posedge clk) begin + Thresh <= Threshs[addr]; + end + end + + end : blkThresh + + // Pipeline State + pipe_t P = '{ op: NOP, default: 'x }; + logic Reval = 0; + always_ff @(posedge clk) begin + if(rst) begin + P <= '{ op: NOP, default: 'x }; + Reval <= 0; + end + else begin + P <= p; + Reval <= (p.op ==? RB) && cs; + end + end + + logic cmp; + if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val); + else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val); + else begin : blkSignedFloat + uwire mag_eq = Thresh[K-2:0] == P.val[K-2:0]; + uwire mag_le = Thresh[K-2:0] <= P.val[K-2:0]; + always_comb begin + unique case({Thresh[K-1], P.val[K-1]}) + 2'b00: cmp = mag_le; + 2'b01: cmp = 0; + 2'b10: cmp = 1; + 2'b11: cmp = !mag_le || mag_eq; + default: cmp = 'x; + endcase + end + end : blkSignedFloat + + // Pipeline State Update + pipe_t pp; + always_comb begin + pp = P; + if(P.op !=? CFG) pp.ptr[SN] = cmp; + if(Reval) pp.val = Thresh; + end + + // Pipeline State Forward (potentially additional register) + pipe_t pf; + if(!DEEP_PIPELINE) assign pf = pp; + else begin + pipe_t Pf = '{ op: NOP, default: 'x }; + always_ff @(posedge clk) begin + if(rst) Pf <= '{ op: NOP, default: 'x }; + else Pf <= pp; + end + assign pf = Pf; + end + + assign pipe[pe][stage+1] = pf; + + end : genPE + end : genStages + + //----------------------------------------------------------------------- + // Configuration Readback + always_comb begin + cfg_rack = 0; + cfg_q = 0; + foreach(pipe[pe]) begin + automatic pipe_t p = pipe[pe][N]; + cfg_rack |= p.op ==? RB; + cfg_q |= p.val; + end + end + + //----------------------------------------------------------------------- + // Stream Output through FIFO + // - Depth of N + Output Reg to allow pipe to drain entirely under backpressure + // - Typically mapped to an SRL shift register + if(1) begin : blkStreamOutput + localparam int unsigned A_DEPTH = MAX_PENDING - 1; + logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH]; + logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1 + uwire avld = !APtr[$left(APtr)]; + + logic [PE-1:0][N-1:0] BDat = 'x; + logic BVld = 0; + + uwire aload = pipe[0][N].op ==? TH; + uwire bload = !BVld || ordy; + + always_ff @(posedge clk) begin + if(aload) begin + assert(APtr < $signed(A_DEPTH-1)) else begin + $error("Overrun after failing stream guard."); + $stop; + end + foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr; + for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1]; + end + end + always_ff @(posedge clk) begin + if(rst) APtr <= '1; + else APtr <= APtr + (aload == (avld && bload)? 0 : aload? 1 : -1); + end + always_ff @(posedge clk) begin + if(rst) begin + BDat <= 'x; + BVld <= 0; + end + else if(bload) begin + BDat <= ADat[APtr]; + BVld <= avld; + end + end + + assign ovld = BVld; + for(genvar pe = 0; pe < PE; pe++) begin + assign odat[pe] = BDat[pe] + BIAS; + end + end : blkStreamOutput + +endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv new file mode 100644 index 0000000000..1f235b9486 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -0,0 +1,164 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief All-AXI interface adapter for thresholding module. + * @author Thomas B. Preußer + * + * @description + * This AXI adapter fits the core thresholding functionality: + * - with AXI stream data interfaces with flow control + * - with implicit round-robin channel rotation as used by FINN, and + * - performs aligned byte address to parameter word address translation. + *****************************************************************************/ + +module thresholding_axi #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C = 1, // Channels + int unsigned PE = 1, // Processing Parallelism, requires C = k*PE + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + + bit USE_AXILITE, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axilite_AWVALID, + output logic s_axilite_AWREADY, + input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input logic s_axilite_WVALID, + output logic s_axilite_WREADY, + input logic [31:0] s_axilite_WDATA, + input logic [ 3:0] s_axilite_WSTRB, + + output logic s_axilite_BVALID, + input logic s_axilite_BREADY, + output logic [1:0] s_axilite_BRESP, + + // Reading + input logic s_axilite_ARVALID, + output logic s_axilite_ARREADY, + input logic [ADDR_BITS-1:0] s_axilite_ARADDR, + + output logic s_axilite_RVALID, + input logic s_axilite_RREADY, + output logic [31:0] s_axilite_RDATA, + output logic [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata +); + + //----------------------------------------------------------------------- + // AXI-lite Configuration Interface + uwire cfg_en; + uwire cfg_we; + uwire [ADDR_BITS-3:0] cfg_a; + uwire [K -1:0] cfg_d; + uwire cfg_rack; + uwire [K -1:0] cfg_q; + + if(USE_AXILITE) begin + uwire [ADDR_BITS-1:0] cfg_a0; + axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi ( + .aclk(ap_clk), .aresetn(ap_rst_n), + + .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x), + .wready(s_axilite_WREADY), .wvalid(s_axilite_WVALID), .wdata(s_axilite_WDATA), .wstrb(s_axilite_WSTRB), + .bready(s_axilite_BREADY), .bvalid(s_axilite_BVALID), .bresp(s_axilite_BRESP), + + .arready(s_axilite_ARREADY), .arvalid(s_axilite_ARVALID), .araddr(s_axilite_ARADDR), .arprot('x), + .rready(s_axilite_RREADY), .rvalid(s_axilite_RVALID), .rresp(s_axilite_RRESP), .rdata(s_axilite_RDATA), + + .ip_en(cfg_en), .ip_wen(cfg_we), .ip_addr(cfg_a0), .ip_wdata(cfg_d), + .ip_rack(cfg_rack), .ip_rdata(cfg_q) + ); + assign cfg_a = cfg_a0[ADDR_BITS-3:0]; + always_ff @(posedge ap_clk) begin + assert(!ap_rst_n || !cfg_en || (cfg_a0[ADDR_BITS-2+:2] === 3'h0)) else begin + $error("%m: Spurious high address bits."); + $stop; + end + end + end + else begin + assign cfg_en = 0; + assign cfg_we = 'x; + assign cfg_a = 'x; + assign cfg_d = 'x; + end + + //----------------------------------------------------------------------- + // Kernel Implementation + thresholding #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) impl ( + .clk(ap_clk), .rst(!ap_rst_n), + + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata), + .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) + ); + +endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v new file mode 100644 index 0000000000..3f0b012ef1 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -0,0 +1,120 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + * @brief Verilog wrapper for IP packaging. + */ + +module thresholding_template_wrapper #( + parameter N = $N$, // output precision + parameter K = $M$, // input/threshold precision + parameter C = $C$, // Channels + parameter PE = $PE$, // Processing Parallelism, requires C = k*PE + + parameter SIGNED = $SIGNED$, // signed inputs + parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data + parameter USE_AXILITE = $USE_AXILITE$, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + parameter DEPTH_TRIGGER_URAM = $DEPTH_TRIGGER_URAM$, // if non-zero, local mems of this depth or more go into URAM (prio) + parameter DEPTH_TRIGGER_BRAM = $DEPTH_TRIGGER_BRAM$, // if non-zero, local mems of this depth or more go into BRAM + parameter DEEP_PIPELINE = $DEEP_PIPELINE$, // [bit] extra pipeline stages for easier timing closure + + parameter O_BITS = $O_BITS$ +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axilite:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output in0_V_tready, + input in0_V_tvalid, + input [((PE*K+7)/8)*8-1:0] in0_V_tdata, + + //- AXI Stream - Output ------------- + input out_V_tready, + output out_V_tvalid, + output [((PE*O_BITS+7)/8)*8-1:0] out_V_tdata +); + + thresholding_axi #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), + .FPARG(FPARG), + .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), + .USE_AXILITE(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), + .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) core ( + .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), + + .s_axilite_AWVALID(s_axilite_AWVALID), .s_axilite_AWREADY(s_axilite_AWREADY), .s_axilite_AWADDR(s_axilite_AWADDR), + .s_axilite_WVALID(s_axilite_WVALID), .s_axilite_WREADY(s_axilite_WREADY), .s_axilite_WDATA(s_axilite_WDATA), .s_axilite_WSTRB(s_axilite_WSTRB), + .s_axilite_BVALID(s_axilite_BVALID), .s_axilite_BREADY(s_axilite_BREADY), .s_axilite_BRESP(s_axilite_BRESP), + + .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR), + .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP), + .s_axis_tready(in0_V_tready), .s_axis_tvalid(in0_V_tvalid), .s_axis_tdata(in0_V_tdata), + .m_axis_tready(out_V_tready), .m_axis_tvalid(out_V_tvalid), .m_axis_tdata(out_V_tdata) + ); + +endmodule // thresholding_template_wrapper diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv new file mode 100644 index 0000000000..a8a18be691 --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresh_gen.sv @@ -0,0 +1,45 @@ +module thresh_gen; + localparam int unsigned K = 9; + localparam int unsigned N = 4; + localparam int unsigned C = 6; + + typedef logic [K-1:0] thresh_t; + localparam thresh_t THRESHOLDS[C][2**N-1] = '{ + '{ 'h00, 'h01, 'h02, 'h03, 'h04, 'h05, 'h06, 'h07, 'h08, 'h09, 'h0a, 'h0b, 'h0c, 'h0d, 'h0e }, + '{ 'h10, 'h11, 'h12, 'h13, 'h14, 'h15, 'h16, 'h17, 'h18, 'h19, 'h1a, 'h1b, 'h1c, 'h1d, 'h1e }, + '{ 'h20, 'h21, 'h22, 'h23, 'h24, 'h25, 'h26, 'h27, 'h28, 'h29, 'h2a, 'h2b, 'h2c, 'h2d, 'h2e }, + '{ 'h30, 'h31, 'h32, 'h33, 'h34, 'h35, 'h36, 'h37, 'h38, 'h39, 'h3a, 'h3b, 'h3c, 'h3d, 'h3e }, + '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e }, + '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e } + }; + localparam THRESHOLDS_PATH = "."; + + localparam int unsigned PE = 2; + localparam int unsigned CF = C/PE; + + for(genvar stage = 0; stage < N; stage++) begin + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin + initial begin + automatic string file = $sformatf("%s/threshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); + + automatic thresh_t threshs[CF * 2**stage]; + for(int unsigned c = 0; c < CF; c++) begin + for(int unsigned i = 0; i < 2**stage; i++) begin + threshs[(c << stage) + i] = THRESHOLDS[c*PE + pe][(i<<(N-stage)) + 2**SN-1]; + end + end + + $writememh(file, threshs); + end + end + end + + // Quit after running all initializers + initial begin + #1ns; + $display("Generation done."); + $finish; + end + +endmodule : thresh_gen diff --git a/finn-rtllib/thresholding/sim/thresholding.tcl b/finn-rtllib/thresholding/sim/thresholding.tcl new file mode 100644 index 0000000000..82dc59deb1 --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding.tcl @@ -0,0 +1,17 @@ +create_project -force thresholding thresholding.vivado -part xcvc1902-vsva2197-2MP-e-S +set_property board_part xilinx.com:vck190:part0:2.2 [current_project] + +read_verilog hdl/axilite_if.v +read_verilog -sv { hdl/thresholding.sv hdl/thresholding_axi.sv } + +set simset [current_fileset -simset] +set_property -name xsim.simulate.log_all_signals -value true -objects $simset +set_property -name xsim.simulate.runtime -value all -objects $simset +add_files -fileset $simset { sim/thresholding_tb.sv sim/thresholding_axi_tb.sv } + +foreach top { thresholding_tb thresholding_axi_tb } { + set_property top $top $simset + + launch_simulation + close_sim +} diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv new file mode 100644 index 0000000000..918f539d15 --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -0,0 +1,314 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for thresholding_axi. + * @author Monica Chiosa + * + */ + +module thresholding_axi_tb #( + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + real M0 = 7.3, // slope of the uniform thresholding line + real B0 = 3.1, // offset of the uniform thresholding line + bit THROTTLED = 1, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2 +); + + //----------------------------------------------------------------------- + // Design Geometry + + // For each channel = [0,channel): + // M_channel = M0 + CX*channel + // B_channel = B0 + CX*channel + // Input/threshold precision computed according with the maximum posible value + localparam real CX = 1.375; + localparam int unsigned K = $clog2((2**N-1)*(M0+C*CX) + (B0+C*CX)); // unused sign + magnitude + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C); + + localparam int unsigned MST_STRM_WROUNDS = 503; + + typedef int unsigned threshs_t[C][2**N-1]; + function threshs_t init_thresholds(); + automatic threshs_t res; + for(int unsigned c = 0; c < C; c++) begin + automatic real m = M0 + c*CX; + automatic real b = B0 + c*CX; + foreach(res[c][i]) begin + res[c][i] = int'($ceil(m*i + b)); + end + end + return res; + endfunction : init_thresholds + localparam threshs_t THRESHS = init_thresholds(); + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // DUT + logic s_axilite_AWVALID; + uwire s_axilite_AWREADY; + logic [ADDR_BITS-1:0] s_axilite_AWADDR; // lowest 2 bits (byte selectors) are ignored + logic s_axilite_WVALID; + uwire s_axilite_WREADY; + logic [ 31:0] s_axilite_WDATA; + uwire s_axilite_BVALID; + logic s_axilite_BREADY; + uwire [ 1:0] s_axilite_BRESP; + logic s_axilite_ARVALID; + uwire s_axilite_ARREADY; + logic [ADDR_BITS-1:0] s_axilite_ARADDR; + uwire s_axilite_RVALID; + uwire s_axilite_RREADY = 1; + uwire [ 31:0] s_axilite_RDATA; + uwire [ 1:0] s_axilite_RRESP; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut ( + .ap_clk(clk), .ap_rst_n(!rst), + + // Configuration + .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR, + .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1), + .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP, + .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR, + .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP, + + // Stream Processing + .s_axis_tready(irdy), .s_axis_tvalid(ivld), .s_axis_tdata(idat), + .m_axis_tready(ordy), .m_axis_tvalid(ovld), .m_axis_tdata(odat) + ); + + //----------------------------------------------------------------------- + // Input Stimuli + typedef logic [PE-1:0][K-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + input_t QW[$]; // Input Feed Tracing + addr_t QC[$]; + + int unsigned error_cnt = 0; + bit done = 0; + initial begin + // Report testbench details + $display("Testbench - tresholding K=%0d -> N=%0d", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("Channel #%0d: Thresholds = {", c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0d", THRESHS[c][i]); + $display(" }"); + end + + // Config + s_axilite_AWVALID = 0; + s_axilite_AWADDR = 'x; + s_axilite_WVALID = 0; + s_axilite_WDATA = 'x; + s_axilite_BREADY = 0; + s_axilite_ARVALID = 0; + s_axilite_ARADDR = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuration + for(int unsigned c = 0; c < C; c+=PE) begin + automatic addr_t addr = 0; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = c/PE; + for(int unsigned pe = 0; pe < PE; pe++) begin + if(PE > 1) addr[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + addr[0+:N] = t; + fork + begin + s_axilite_AWVALID <= 1; + s_axilite_AWADDR <= { addr, 2'b00 }; + @(posedge clk iff s_axilite_AWREADY); + s_axilite_AWVALID <= 0; + s_axilite_AWADDR <= 'x; + end + begin + s_axilite_WVALID <= 1; + s_axilite_WDATA <= THRESHS[c+pe][t]; + @(posedge clk iff s_axilite_WREADY); + s_axilite_WVALID <= 0; + s_axilite_WDATA <= 'x; + end + begin + s_axilite_BREADY <= 1; + @(posedge clk iff s_axilite_BVALID); + assert(s_axilite_BRESP == '0) else begin + $error("Error on parameter write."); + $stop; + end + s_axilite_BREADY <= 0; + end + join + end + end + end + + fork + // Intermittent configuration readback + while(!done) begin + if(($urandom()%37) != 0) begin + s_axilite_ARVALID <= 0; + s_axilite_ARADDR <= 'x; + @(posedge clk); + end + else begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + s_axilite_ARVALID <= 1; + s_axilite_ARADDR <= { addr, 2'b00 }; + @(posedge clk iff s_axilite_ARREADY); + + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat(N+6) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("Missing %0d outputs.", QW.size()); + $stop; + end + assert(QC.size() == 0) else begin + $error("Missing %0d readback replies.", QC.size()); + $stop; + end + + $display("Test completed: %0d errors in %0d tests.", error_cnt, MST_STRM_WROUNDS); + $display("========================================="); + $finish; + end + + // Output Checker ------------------------------------------------------- + + // Configuration Readback + always_ff @(posedge clk iff s_axilite_RVALID) begin + assert(s_axilite_RRESP == '0) else begin + $error("Read back error."); + $stop; + end + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 0 : addr[N+:$clog2(PE)]); + automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; + assert(s_axilite_RDATA == exp) else begin + $error("Readback mismatch on #%0d.%0d: %0d instead of %0d", cnl, addr[0+:N], s_axilite_RDATA, exp); + $stop; + end + end + else begin + $error("Spurious readback output."); + $stop; + end + end + + // Stream Output + int unsigned OCnl = 0; + always @(posedge clk) begin + if(rst) begin + OCnl <= 0; + ordy <= 1'b0; + end + else begin + if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; + + if(ordy && ovld) begin + assert(QW.size()) begin + automatic input_t x = QW.pop_front(); + + for(int unsigned pe = 0; pe < PE; pe++) begin + automatic int unsigned cnl = OCnl + pe; + + $display("Mapped CNL=%0d DAT=%3d -> #%2d", cnl, x[pe], odat[pe]); + assert( + ((odat[pe] == 0) || (THRESHS[cnl][odat[pe]-1] <= x[pe])) && + ((odat[pe] == 2**N-1) || (x[pe] < THRESHS[cnl][odat[pe]])) + ) else begin + $error("Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", cnl, x[pe], odat[pe]); + error_cnt++; + $stop; + end + end + end + else begin + $error("Spurious output."); + $stop; + end + + OCnl <= (OCnl + PE)%C; + end + end + end + +endmodule: thresholding_axi_tb diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv new file mode 100644 index 0000000000..e42145f10e --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv @@ -0,0 +1,274 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for thresholding_axi. + * @author Monica Chiosa + * + */ + +module thresholding_tb #( + int unsigned K = 10, // input precision + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + + localparam int unsigned CF = C/PE // Channel Fold +); + localparam bit DEEP_PIPELINE = 1; + + localparam int unsigned MST_STRM_WROUNDS = 507; + localparam bit THROTTLED = 1; + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // Parallel Instances differing in Data Type + typedef logic [K -1:0] val_t; + typedef val_t threshs_t[C][2**N-1]; + typedef val_t [PE-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + logic [0:2] term = '0; + always_comb begin + if(&term) $finish; + end + for(genvar i = 0; i < 3; i++) begin : genTypes + localparam bit SIGNED = i>0; + localparam bit FPARG = i>1; + + //- DUT ------------------------- + logic cfg_en; + logic cfg_we; + logic [$clog2(C)+N-1:0] cfg_a; + logic [K-1:0] cfg_d; + uwire cfg_rack; + uwire [K-1:0] cfg_q; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .USE_CONFIG(1), .DEEP_PIPELINE(DEEP_PIPELINE)) dut ( + .clk, .rst, + + // Configuration + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + // Stream Processing + .irdy, .ivld, .idat, + .ordy, .ovld, .odat + ); + + //- Stimulus Driver ------------- + threshs_t THRESHS; + function val_t sigord(input val_t x); + automatic val_t res = x; + if(SIGNED) begin + if(FPARG && x[K-1]) res[K-2:0] = ~x[K-2:0]; + res[K-1] = !x[K-1]; + end + return res; + endfunction : sigord + + input_t QW[$]; // Input tracing + addr_t QC[$]; // Readback tracking + int unsigned error_cnt = 0; + bit done = 0; + initial begin + + // Generate thresholds + std::randomize(THRESHS); + foreach(THRESHS[c]) begin + val_t row[2**N-1] = THRESHS[c]; + row.sort with (sigord(item)); + THRESHS[c] = row; + end + + // Report test case details + $display("[%0d] Thresholding %s%s%0d -> uint%0d", i, SIGNED? "s" : "u", FPARG? "fp" : "int", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("[%0d] Channel #%0d: Thresholds = {", i, c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0X", THRESHS[c][i]); + $display(" }"); + end + + // Config + cfg_en = 0; + cfg_we = 'x; + cfg_a = 'x; + cfg_d = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuratin + cfg_en <= 1; + cfg_we <= 1; + for(int unsigned c = 0; c < C; c+=PE) begin + if(CF > 1) cfg_a[N+$clog2(PE)+:$clog2(CF)] <= c/PE; + for(int unsigned pe = 0; pe < PE; pe++) begin + if(PE > 1) cfg_a[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + cfg_a[0+:N] <= t; + cfg_d <= THRESHS[c+pe][t]; + @(posedge clk); + end + end + end + cfg_d <= 'x; + + fork + // Intermittent configuration readback + while(!done) begin + cfg_en <= 0; + cfg_we <= 'x; + cfg_a <= 'x; + @(posedge clk); + if(($urandom()%41) == 0) begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + cfg_en <= 1; + cfg_we <= 0; + cfg_a <= addr; + @(posedge clk); + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat((DEEP_PIPELINE+1)*N+6) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("[%0d] Missing %0d outputs.", i, QW.size()); + $stop; + end + assert(QC.size() == 0) else begin + $error("[%0d] Missing %0d readback replies.", i, QC.size()); + $stop; + end + + $display("[%0d] Test completed: %0d errors in %0d tests.", i, error_cnt, MST_STRM_WROUNDS); + $display("============================================="); + term[i] <= 1; + end + + //- Readback Checker -------------- + always_ff @(posedge clk iff cfg_rack) begin + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 0 : addr[N+:$clog2(PE)]); + automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; + assert(cfg_q == exp) else begin + $error("[%0d] Readback mismatch on #%0d.%0d: %0d instead of %0d", i, cnl, addr[0+:N], cfg_q, exp); + $stop; + end + end + else begin + $error("[%0d] Spurious readback output.", i); + $stop; + end + end + + // Output Checker + int unsigned OCnl = 0; + always @(posedge clk) begin + if(rst) begin + OCnl <= 0; + ordy <= 1'b0; + end + else begin + if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; + + if(ordy && ovld) begin + assert(QW.size()) begin + automatic input_t x = QW.pop_front(); + + for(int unsigned pe = 0; pe < PE; pe++) begin + automatic int unsigned cnl = OCnl + pe; + + $display("[%0d] Mapped CNL=%0d DAT=%3x -> #%2d", i, cnl, x[pe], odat[pe]); + assert( + ((odat[pe] == 0) || (sigord(THRESHS[cnl][odat[pe]-1]) <= sigord(x[pe]))) && + ((odat[pe] == 2**N-1) || (sigord(x[pe]) < sigord(THRESHS[cnl][odat[pe]]))) + ) else begin + $error("[%0d] Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", i, cnl, x[pe], odat[pe]); + error_cnt++; + $stop; + end + end + end + else begin + $error("[%0d] Spurious output.", i); + $stop; + end + + OCnl <= (OCnl + PE)%C; + end + end + end + + end : genTypes + +endmodule: thresholding_tb diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl new file mode 100644 index 0000000000..338304fa40 --- /dev/null +++ b/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl @@ -0,0 +1,187 @@ + +# Loading additional proc with user specified bodies to compute parameter values. +source [file join [file dirname [file dirname [info script]]] gui/thresholding_axi_v1_0.gtcl] + +# Definitional proc to organize widgets for parameters. +proc init_gui { IPINST } { + ipgui::add_param $IPINST -name "Component_Name" + #Adding Page + set Page_0 [ipgui::add_page $IPINST -name "Page 0"] + ipgui::add_param $IPINST -name "ADDR_BITS" -parent ${Page_0} + ipgui::add_param $IPINST -name "BIAS" -parent ${Page_0} + ipgui::add_param $IPINST -name "C" -parent ${Page_0} + ipgui::add_param $IPINST -name "CF" -parent ${Page_0} + ipgui::add_param $IPINST -name "FPARG" -parent ${Page_0} + ipgui::add_param $IPINST -name "K" -parent ${Page_0} + ipgui::add_param $IPINST -name "N" -parent ${Page_0} + ipgui::add_param $IPINST -name "O_BITS" -parent ${Page_0} + set PE [ipgui::add_param $IPINST -name "PE" -parent ${Page_0}] + set_property tooltip {PE Count} ${PE} + ipgui::add_param $IPINST -name "SIGNED" -parent ${Page_0} + + +} + +proc update_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS PARAM_VALUE.C PARAM_VALUE.PE PARAM_VALUE.N } { + # Procedure called to update ADDR_BITS when any of the dependent parameters in the arguments change + + set ADDR_BITS ${PARAM_VALUE.ADDR_BITS} + set C ${PARAM_VALUE.C} + set PE ${PARAM_VALUE.PE} + set N ${PARAM_VALUE.N} + set values(C) [get_property value $C] + set values(PE) [get_property value $PE] + set values(N) [get_property value $N] + set_property value [gen_USERPARAMETER_ADDR_BITS_VALUE $values(C) $values(PE) $values(N)] $ADDR_BITS +} + +proc validate_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS } { + # Procedure called to validate ADDR_BITS + return true +} + +proc update_PARAM_VALUE.CF { PARAM_VALUE.CF PARAM_VALUE.C PARAM_VALUE.PE } { + # Procedure called to update CF when any of the dependent parameters in the arguments change + + set CF ${PARAM_VALUE.CF} + set C ${PARAM_VALUE.C} + set PE ${PARAM_VALUE.PE} + set values(C) [get_property value $C] + set values(PE) [get_property value $PE] + set_property value [gen_USERPARAMETER_CF_VALUE $values(C) $values(PE)] $CF +} + +proc validate_PARAM_VALUE.CF { PARAM_VALUE.CF } { + # Procedure called to validate CF + return true +} + +proc update_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS PARAM_VALUE.BIAS PARAM_VALUE.N } { + # Procedure called to update O_BITS when any of the dependent parameters in the arguments change + + set O_BITS ${PARAM_VALUE.O_BITS} + set BIAS ${PARAM_VALUE.BIAS} + set N ${PARAM_VALUE.N} + set values(BIAS) [get_property value $BIAS] + set values(N) [get_property value $N] + set_property value [gen_USERPARAMETER_O_BITS_VALUE $values(BIAS) $values(N)] $O_BITS +} + +proc validate_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS } { + # Procedure called to validate O_BITS + return true +} + +proc update_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { + # Procedure called to update BIAS when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { + # Procedure called to validate BIAS + return true +} + +proc update_PARAM_VALUE.C { PARAM_VALUE.C } { + # Procedure called to update C when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.C { PARAM_VALUE.C } { + # Procedure called to validate C + return true +} + +proc update_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { + # Procedure called to update FPARG when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { + # Procedure called to validate FPARG + return true +} + +proc update_PARAM_VALUE.K { PARAM_VALUE.K } { + # Procedure called to update K when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.K { PARAM_VALUE.K } { + # Procedure called to validate K + return true +} + +proc update_PARAM_VALUE.N { PARAM_VALUE.N } { + # Procedure called to update N when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.N { PARAM_VALUE.N } { + # Procedure called to validate N + return true +} + +proc update_PARAM_VALUE.PE { PARAM_VALUE.PE } { + # Procedure called to update PE when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.PE { PARAM_VALUE.PE } { + # Procedure called to validate PE + return true +} + +proc update_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { + # Procedure called to update SIGNED when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { + # Procedure called to validate SIGNED + return true +} + + +proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N} +} + +proc update_MODELPARAM_VALUE.K { MODELPARAM_VALUE.K PARAM_VALUE.K } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.K}] ${MODELPARAM_VALUE.K} +} + +proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C} +} + +proc update_MODELPARAM_VALUE.PE { MODELPARAM_VALUE.PE PARAM_VALUE.PE } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.PE}] ${MODELPARAM_VALUE.PE} +} + +proc update_MODELPARAM_VALUE.SIGNED { MODELPARAM_VALUE.SIGNED PARAM_VALUE.SIGNED } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.SIGNED}] ${MODELPARAM_VALUE.SIGNED} +} + +proc update_MODELPARAM_VALUE.FPARG { MODELPARAM_VALUE.FPARG PARAM_VALUE.FPARG } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.FPARG}] ${MODELPARAM_VALUE.FPARG} +} + +proc update_MODELPARAM_VALUE.BIAS { MODELPARAM_VALUE.BIAS PARAM_VALUE.BIAS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.BIAS}] ${MODELPARAM_VALUE.BIAS} +} + +proc update_MODELPARAM_VALUE.CF { MODELPARAM_VALUE.CF PARAM_VALUE.CF } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.CF}] ${MODELPARAM_VALUE.CF} +} + +proc update_MODELPARAM_VALUE.ADDR_BITS { MODELPARAM_VALUE.ADDR_BITS PARAM_VALUE.ADDR_BITS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.ADDR_BITS}] ${MODELPARAM_VALUE.ADDR_BITS} +} + +proc update_MODELPARAM_VALUE.O_BITS { MODELPARAM_VALUE.O_BITS PARAM_VALUE.O_BITS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.O_BITS}] ${MODELPARAM_VALUE.O_BITS} +} diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index d6c0794b00..c29a805b62 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -63,6 +63,9 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch +from finn.custom_op.fpgadataflow.thresholding_binary_search import ( + Thresholding_Binary_Search, +) from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation @@ -86,6 +89,7 @@ custom_op["FMPadding_Batch"] = FMPadding_Batch custom_op["FMPadding_Pixel"] = FMPadding_Pixel custom_op["Thresholding_Batch"] = Thresholding_Batch +custom_op["Thresholding_Binary_Search"] = Thresholding_Binary_Search custom_op["AddStreams_Batch"] = AddStreams_Batch custom_op["LabelSelect_Batch"] = LabelSelect_Batch custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py new file mode 100755 index 0000000000..d02b778823 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -0,0 +1,579 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import find_next_power_of_2, get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import ( + npy_to_rtlsim_input, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +"""@package thresholding_binary_search +- ONNX i/o tensor shape assumptions for Thresholding: +- input 0 is the input tensor, shape (..., NumChannels) +- input 1 is the threshold tensor, shape (NumChannels, n_thres) +- output 0 is the output tensor, shape (..., NumChannels) - same as input +- the '...' here can be any shape (representing groups of vectors) + +This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' +for a HLS equivalent. +""" + + +class Thresholding_Binary_Search(HLSCustomOp): + """Class that corresponds to finn-rtllib 'thresholding' function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function. Used only in decoupled mode + "numSteps": ("i", True, 1), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # name of the top module in verilog template. Used by PyVerilator + # and IPI generation + "gen_top_module": ("s", False, ""), + # bias to be applied to outputs of the node + "activation_bias": ("i", False, 0), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_tmem(self): + """Calculates and returns TMEM.""" + num_channels = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return num_channels // pe + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + """Used for FINN DataType inference: set the output tensors' datatypes + accordingly for this node""" + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + """Required by the FINN nalysis module. Checks if custom ops in graph + are correctly built, with all attributes and inputs.""" + return [] + + def bram_estimation(self): + return 0 + + def lut_estimation(self): + return 0 + + def get_input_datatype(self, ind=0): + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + return DataType[self.get_nodeattr("outputDataType")] + + def get_weight_datatype(self): + """The term 'weights' and 'thresholds' are used interchangably in this class.""" + return DataType[self.get_nodeattr("weightDataType")] + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("weightDataType", tdt.name) + return DataType[self.get_nodeattr("weightDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_weightstream_width(self): + """Returns weight stream width""" + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + + def get_folded_input_shape(self, ind=0): + fold = self.calc_tmem() + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + num_channels = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [num_channels]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + return 0 + + def get_exp_cycles(self): + return 0 + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." + assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr( + "numSteps" + ), "Mismatch in threshold steps" + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert np.equal( + np.mod(orig_thres_matrix, 1), 0 + ).all(), "Need int threshold tensor" + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ( + ret.shape[0] == mh + ), "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def prepare_codegen_rtl_values(self): + """All dictionary values produced in this function are to replace + their key value(s) in the RTL template files""" + code_gen_dict = {} + + # Identify the module name + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + "_axi_wrapper" + ] + # Set the top module name - AXI wrapper + code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] + + # Identify the module variables + output_data_type = self.get_nodeattr("outputDataType") # output precision + input_data_type = self.get_nodeattr( + "inputDataType" + ) # input/threshold precision + num_channels = self.get_nodeattr("NumChannels") # number of channels + bias = self.get_nodeattr("activation_bias") # activation bias value + pe = self.get_nodeattr("PE") + + code_gen_dict["$N$"] = [ + str(DataType[output_data_type].bitwidth()) + ] # output precision - convert bitwidth to string + code_gen_dict["$M$"] = [ + str(DataType[input_data_type].bitwidth()) + ] # input/threshold precision - convert bitwidth to string + code_gen_dict["$C$"] = [str(num_channels)] # number of channels + code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value + code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE + + # Is the input datatype signed or unsigned? + # The thresholding core needs to know this when comparing weights to inputs + if self.get_input_datatype().signed(): + code_gen_dict["$SIGNED$"] = [str(1)] + else: + code_gen_dict["$SIGNED$"] = [str(0)] + + return code_gen_dict + + def get_rtl_file_list(self): + """Thresholding binary search RTL file list""" + return ["thresholding.sv", "thresholding_axi.sv", "thresholding_axi_wrapper.v"] + + def get_rtl_file_paths(self): + """Get full path of all RTL files""" + rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" + rtl_file_list = self.get_rtl_file_list() + rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] + return rtl_file_paths + + def get_rtl_template_data(self, path): + """Return RTL file contents as a template""" + with open(path, "r") as f: + template = f.read() + return template + + def fill_in_rtl_template_data(self, replace_dict, template_data): + """Use attribute values to finn in RTL template placeholders""" + template_data_cp = template_data + for key in replace_dict: + replacement_line = "\n".join(replace_dict[key]) + template_data_cp = template_data_cp.replace(key, replacement_line) + return template_data_cp + + def dump_rtl_data(self, dest_dir, filename, data): + """Dump filled-in-template RTL files for future synthesis step""" + with open(os.path.join(dest_dir, filename), "w") as f: + f.write(data) + return + + def generate_hdl(self): + """Prepare HDL files from templates for synthesis""" + # Generate a dictionary of values to put in RTL template + code_gen_dict = self.prepare_codegen_rtl_values() + + # Retrieve the destination directory for the final RTL files + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + for rtl_file_path in self.get_rtl_file_paths(): + # read in original RTL template file + template_data = self.get_rtl_template_data(rtl_file_path) + # apply code generation to templates + data = self.fill_in_rtl_template_data(code_gen_dict, template_data) + # dump filled-in template to destination directory for compilation + file_only_path = rtl_file_path.split("/")[-1] + self.dump_rtl_data(code_gen_dir, file_only_path, data) + + # Before we return - set the 'gen_top_module' attribute for use later + # by PyVerilator and IPI generation + self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) + return + + def code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl() + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + # i.e. during the HLSSynthIP() transformation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + return + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = self.get_rtl_file_list() + + # build the Verilator emulation library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_nodeattr("gen_top_module"), + ) + + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def execute_node(self, context, graph): + # Perform input checks + if self.get_nodeattr("exec_mode") != "rtlsim": + raise Exception( + "Invalid exec_mode value: {}; exec_mode must be set to '{}'".format( + self.get_nodeattr("exec_mode"), "rtlsim" + ) + ) + + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for Thresholding_Binary_Search") + in_ind += 1 + + # Create a PyVerilator wrapper of the RTLSim .so + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"s_axis": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + + # Manage output data + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + return + + def code_generation_ipi(self): + """Constructs and returns the TCL commands for node instantiation as an RTL + block.""" + cmd = [] + rtl_file_list = self.get_rtl_file_list() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + for rtl_file in rtl_file_list: + cmd.append( + "add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file)) + ) + + # Create an RTL block, not an IP core (-type ip) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ) + + return cmd + + def get_verilog_top_module_intf_names(self): + """Return a dict of names of input and output interfaces. + The keys reflect the protocols each interface implements: + 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. + Values are lists of tuples (axis, aximm) or names (axilite): + 'axis' tuples correspond to the list of node inputs in order, + each tuple is (interface_name, interface_width_bits). + axilite always assumed to be 32 bits and is not tuple (name only). + Each block must have at most one aximm and one axilite.""" + + intf_names = super().get_verilog_top_module_intf_names() + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def get_dynamic_config(self, model, address_stride=1): + """Returns a configuration dictionary containing axilite write commands + in order to program the thresholds into the RTL core during runtime. + The default address stride for the weights is 1 byte.""" + + thresholds = model.get_initializer(self.onnx_node.input[1]) + num_channels, num_weights_per_channel = thresholds.shape + + weight_addr_boundary = find_next_power_of_2(num_weights_per_channel) + # Make sure that the next power of 2 (output) is greater than the input + assert weight_addr_boundary >= num_weights_per_channel + + config = {} + channel_cntr = 0 + for channel in thresholds: + channel_start_addr = channel_cntr * weight_addr_boundary * address_stride + weight_cntr = 0 + addr = 0 + for weight in channel: + key_name = "{}_{}{}_{}{}".format( + "axilite", "ch", str(channel_cntr), "w", str(weight_cntr) + ) + config[key_name] = ( + channel_start_addr + addr, + int( + str( + pack_innermost_dim_as_hex_string( + [weight], + self.get_weight_datatype(), + self.get_weight_datatype().bitwidth(), + ) + ), + 0, + ), + ) + + weight_cntr += 1 + addr += address_stride + + channel_cntr += 1 + + return config + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + """This is needed for the HLSSynthIP() transformation. + This is an IP, not a HLS node, so therefore provide an empty hook + to prevent any HLS synthesis.""" + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index ef02453498..a50cbbaed1 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1019,9 +1019,10 @@ def apply(self, model): class InferThresholdingLayer(Transformation): """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - def __init__(self, mem_mode="const"): + def __init__(self, mem_mode="const", use_rtl_variant=False): super().__init__() self.mem_mode = mem_mode + self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -1073,27 +1074,65 @@ def apply(self, model): ) actval = int(actval) assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requres actval < 0" - ) - # create and insert new Thresholding_Batch node - new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - # weightDataType can be tightened by MinimizeAccumulatorWidth - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, + node.name + ": Signed output requires actval < 0" ) + + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + + # Perform checks for RTL variant if chosen + if self.use_rtl_variant: + assert self.mem_mode == "decoupled", ( + """%s : RTL Thresholding only supports 'decoupled' memory + mode.""" + % node.name + ) + + if self.use_rtl_variant and is_rtl_variant_compatible: + new_node = helper.make_node( + "Thresholding_Binary_Search", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + activation_bias=actval, + mem_mode=self.mem_mode, + name="Thresholding_Binary_Search_" + node.name, + ) + else: + if self.use_rtl_variant: + warnings.warn( + """%s : RTL Thresholding requested for unsupported + configuration. Falling back to HLS implementation.""" + % node.name + ) + + # create and insert new Thresholding_Batch node + new_node = helper.make_node( + "Thresholding_Batch", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ActVal=actval, + mem_mode=self.mem_mode, + name="Thresholding_Batch_" + node.name, + ) + graph.node.insert(insert_point, new_node) # remove old node graph.node.remove(node) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 1796738c58..5252422dcf 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -228,3 +228,22 @@ def is_exe(fpath): return exe_file return None + + +def find_next_power_of_2(n): + """For any integer 'n', find the next greatest power of 2""" + # Negative values will loop infinitely below - return 0 + if n <= 0: + return 0 + # If '1' is requested, output will be '0' in the loop below, avoid this now. + elif n == 1: + return 2 # i.e. 2**1 + + # decrement 'n' (to handle cases when `n` itself is a power of 2) + n = n - 1 + + # loop until only one bit is left + while n & n - 1: + # unset rightmost bit + n = n & n - 1 + return n << 1 diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py new file mode 100755 index 0000000000..9c233bdd06 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -0,0 +1,276 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_write, reset_rtlsim +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor +from test_fpgadataflow_thresholding_binary_search import ( + make_single_thresholding_binary_search_modelwrapper, +) + +import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +from finn.core.rtlsim_exec import rtlsim_exec +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +# Helper functions +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def generate_pe_value(fold, num_input_channels): + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + assert num_input_channels % pe == 0 + return pe + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NCHW to NHWC +def convert_np_array_to_finn_data_layout(data): + return np.transpose(data, (0, 2, 3, 1)) + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC to NCHW +def convert_np_array_to_standard_data_layout(data): + return np.transpose(data, (0, 3, 1, 2)) + + +def make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, +): + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + + node_inp_list = ["inp", "thresh"] + + Multithresholding_node = helper.make_node( + "MultiThreshold", + node_inp_list, + ["outp"], + domain="qonnx.custom_op.general", + out_dtype=output_data_type.name, + out_bias=float(activation_bias), + out_scale=1.0, + ) + + graph = helper.make_graph( + nodes=[Multithresholding_node], + name="multithresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="multithresholding-model") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + + +# N.B. Fold values where C % PE != 0 fail +@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) +@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) +@pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_convert_to_hls_tbs_rtl_variant( + activation, + input_data_type, + fold, + num_input_channels, +): + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # See convert_to_hls_layers::InferThresholdingLayer: + # assert (not odt.signed()) or (actval < 0) + # This implies that it expects a negative activation, BIPOLAR does not provide that + if activation == DataType["BIPOLAR"]: + pytest.skip( + "Only negative activations are supported for " + "RTL Thresholding Binary Search node" + ) + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # generate random input data + tensor_shape = tuple(num_input_vecs + [num_input_channels]) + x = gen_finn_dt_tensor(input_data_type, tensor_shape) + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) + + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) + + x_nhwc = convert_np_array_to_standard_data_layout(x) + y = multithreshold(x_nhwc, thresholds) + + # convert back to NHWC for comparison to hw outputs + y = convert_np_array_to_finn_data_layout(y) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += activation.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(InsertFIFO(True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + # Retrieve the axilite programming sequence for weights - for decoupled mode only + tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + tbs_inst = getCustomOp(tbs_node) + config = tbs_inst.get_dynamic_config(model, 4) + + # Reshape generated data (not from model) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # Helper function that delivers the hook to program the thresholds via AXI-Lite + def config_hook(config): + if config is None: + return None + + def write_thresh_config(sim): + # axi_name = "s_axilite_0_" # works + axi_name = getCustomOp( + model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + ).get_verilog_top_module_intf_names()["axilite"][0] + axi_name += "_0_" + + # Write config registers to the Threshold memory. + # The dictionary defines (addr, value) tuples. + for config_entry in config.values(): + addr = config_entry[0] + val = config_entry[1] + axilite_write(sim, addr, val, basename=axi_name) + + reset_rtlsim(sim) + + return write_thresh_config + + input_dict = {"inp": x} + rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) + y_produced = input_dict["outp"] + assert (y_produced == y_expected).all() + + # Make a Multithreshold graph and convert to thresholding binary search node + new_model = make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + # Recreate the model using the ConvertToHLS transform + new_model = new_model.transform( + to_hls.InferThresholdingLayer(mem_mode="decoupled", use_rtl_variant=True) + ) + new_model = new_model.transform(InsertFIFO(True)) + new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + new_model = new_model.transform(HLSSynthIP()) + new_model = new_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + input_dict = {"inp": x} + rtlsim_exec(new_model, input_dict, pre_hook=config_hook(config)) + y_produced_new = input_dict["outp"] + assert (y_produced_new == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py new file mode 100755 index 0000000000..24b60f5ea5 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -0,0 +1,287 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from pyverilator.util.axi_utils import axilite_write, reset_rtlsim +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.util.basic import gen_finn_dt_tensor + +from finn.core.rtlsim_exec import rtlsim_exec +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +# Helper functions +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def generate_pe_value(fold, num_input_channels): + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + assert num_input_channels % pe == 0 + return pe + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NCHW to NHWC +def convert_np_array_to_finn_data_layout(data): + return np.transpose(data, (0, 2, 3, 1)) + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC to NCHW +def convert_np_array_to_standard_data_layout(data): + return np.transpose(data, (0, 3, 1, 2)) + + +def make_single_thresholding_binary_search_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, +): + + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] + ) + + node_inp_list = ["inp", "thresh"] + + Thresholding_node = helper.make_node( + "Thresholding_Binary_Search", + node_inp_list, + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=NumChannels, + PE=pe, + numSteps=thresholds.shape[1], + inputDataType=input_data_type.name, + weightDataType=input_data_type.name, + outputDataType=output_data_type.name, + activation_bias=activation_bias, + numInputVectors=num_input_vecs, + ) + graph = helper.make_graph( + nodes=[Thresholding_node], + name="thresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="thresholding-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + + +# Test brief: Test that PrepareRTLSim() runs successfully. This function is not +# tested in test_fpgadataflow_thresholding_binary_search() +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): + input_data_type = DataType["INT16"] + act = DataType["INT4"] + fold = -1 + num_input_channels = 16 + + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = act.get_num_possible_values() - 1 + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) + thresholds = sort_thresholds_increasing(thresholds) + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = act + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + return + + +# Test brief: Create a Thresholding binary search layer using various parameters +# and test against a SW generated & simulated dataset +# N.B. Fold values where C % PE != 0 fail +@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) +@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) +@pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +@pytest.mark.slow +def test_fpgadataflow_thresholding_binary_search( + activation, input_data_type, fold, num_input_channels +): + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # generate random input data + tensor_shape = tuple(num_input_vecs + [num_input_channels]) + x = gen_finn_dt_tensor(input_data_type, tensor_shape) + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) + + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) + + x_nhwc = convert_np_array_to_standard_data_layout(x) + y = multithreshold(x_nhwc, thresholds) + + # convert back to NHWC for comparison to hw outputs + y = convert_np_array_to_finn_data_layout(y) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += activation.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(InsertFIFO(True)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + # Retrieve the axilite programming sequence for weights - for decoupled mode only + tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + tbs_inst = getCustomOp(tbs_node) + config = tbs_inst.get_dynamic_config(model, 4) + + # Reshape generated data (not from model) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # Helper function that delivers the hook to program the thresholds via AXI-Lite + def config_hook(config): + if config is None: + return None + + def write_thresh_config(sim): + # axi_name = "s_axilite_0_" # works + axi_name = getCustomOp( + model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + ).get_verilog_top_module_intf_names()["axilite"][0] + axi_name += "_0_" + + # Write config registers to the Threshold memory. + # The dictionary defines (addr, value) tuples. + for config_entry in config.values(): + addr = config_entry[0] + val = config_entry[1] + axilite_write(sim, addr, val, basename=axi_name) + + reset_rtlsim(sim) + + return write_thresh_config + + input_dict = {"inp": x} + rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) + y_produced = input_dict["outp"] + assert (y_produced == y_expected).all() diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py new file mode 100755 index 0000000000..97a8c50261 --- /dev/null +++ b/tests/util/test_basic.py @@ -0,0 +1,60 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import finn.util.basic as basic + + +@pytest.mark.util +def test_next_power_of_2(): + test_vector = [ + {"input": -2, "expected_result": 0}, + {"input": -1, "expected_result": 0}, + {"input": 0, "expected_result": 0}, + {"input": 1, "expected_result": 2}, + {"input": 2, "expected_result": 2}, + {"input": 3, "expected_result": 4}, + {"input": 4, "expected_result": 4}, + {"input": 7, "expected_result": 8}, + {"input": 8, "expected_result": 8}, + {"input": 11, "expected_result": 16}, + {"input": 15, "expected_result": 16}, + {"input": 16, "expected_result": 16}, + {"input": 18, "expected_result": 32}, + {"input": 27, "expected_result": 32}, + {"input": 31, "expected_result": 32}, + {"input": 32, "expected_result": 32}, + {"input": 42, "expected_result": 64}, + {"input": 65, "expected_result": 128}, + ] + + for test_dict in test_vector: + output = basic.find_next_power_of_2(test_dict["input"]) + assert output >= test_dict["input"] + assert output == test_dict["expected_result"] From ffcca3c005f83ef768024bc1da18578fb2139c83 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 15 Jan 2024 16:21:31 +0000 Subject: [PATCH 032/291] [BTS-RTLLIB] Fix threshold weight file path --- finn-rtllib/thresholding/hdl/thresholding.sv | 3 +-- finn-rtllib/thresholding/sim/thresh_gen.sv | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 75fbb61a4d..dc612f387f 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -209,8 +209,7 @@ module thresholding #( (* RAM_STYLE = RAM_STYLE *) val_t Threshs[DEPTH]; if(THRESHOLDS_PATH != "") begin - localparam FILE = $sformatf("%s/threshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); - initial $readmemh(FILE, Threshs); + initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs); end if(USE_CONFIG) begin : genThreshMem diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv index a8a18be691..713723aafa 100644 --- a/finn-rtllib/thresholding/sim/thresh_gen.sv +++ b/finn-rtllib/thresholding/sim/thresh_gen.sv @@ -12,7 +12,7 @@ module thresh_gen; '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e }, '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e } }; - localparam THRESHOLDS_PATH = "."; + localparam THRESHOLDS_PATH = "./"; localparam int unsigned PE = 2; localparam int unsigned CF = C/PE; @@ -21,7 +21,7 @@ module thresh_gen; localparam int unsigned SN = N-1-stage; for(genvar pe = 0; pe < PE; pe++) begin initial begin - automatic string file = $sformatf("%s/threshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); + automatic string file = $sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); automatic thresh_t threshs[CF * 2**stage]; for(int unsigned c = 0; c < CF; c++) begin From ce0ebbceaad67604011a634677152c9bb6d3620c Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 15 Jan 2024 16:25:52 +0000 Subject: [PATCH 033/291] [BTS-RTLLIB] Use templates for module wrapper name --- finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v index 3f0b012ef1..79e7ad1bb7 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -31,7 +31,7 @@ * @brief Verilog wrapper for IP packaging. */ -module thresholding_template_wrapper #( +module $MODULE_NAME_AXI_WRAPPER$ #( parameter N = $N$, // output precision parameter K = $M$, // input/threshold precision parameter C = $C$, // Channels @@ -117,4 +117,4 @@ module thresholding_template_wrapper #( .m_axis_tready(out_V_tready), .m_axis_tvalid(out_V_tvalid), .m_axis_tdata(out_V_tdata) ); -endmodule // thresholding_template_wrapper +endmodule // $MODULE_NAME_AXI_WRAPPER$ From 5384739c6a34134f88d199d32cb63643fd005bdc Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Mon, 15 Jan 2024 16:26:53 +0000 Subject: [PATCH 034/291] [BTS-RTLLIB] Upper case signal names --- .../hdl/thresholding_template_wrapper.v | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v index 79e7ad1bb7..ef76a23cbc 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -84,14 +84,14 @@ module $MODULE_NAME_AXI_WRAPPER$ #( output [ 1:0] s_axilite_RRESP, //- AXI Stream - Input -------------- - output in0_V_tready, - input in0_V_tvalid, - input [((PE*K+7)/8)*8-1:0] in0_V_tdata, + output in0_V_TREADY, + input in0_V_TVALID, + input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, //- AXI Stream - Output ------------- - input out_V_tready, - output out_V_tvalid, - output [((PE*O_BITS+7)/8)*8-1:0] out_V_tdata + input out_V_TREADY, + output out_V_TVALID, + output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA ); thresholding_axi #( @@ -113,8 +113,8 @@ module $MODULE_NAME_AXI_WRAPPER$ #( .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR), .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP), - .s_axis_tready(in0_V_tready), .s_axis_tvalid(in0_V_tvalid), .s_axis_tdata(in0_V_tdata), - .m_axis_tready(out_V_tready), .m_axis_tvalid(out_V_tvalid), .m_axis_tdata(out_V_tdata) + .s_axis_tready(in0_V_TREADY), .s_axis_tvalid(in0_V_TVALID), .s_axis_tdata(in0_V_TDATA), + .m_axis_tready(out_V_TREADY), .m_axis_tvalid(out_V_TVALID), .m_axis_tdata(out_V_TDATA) ); endmodule // $MODULE_NAME_AXI_WRAPPER$ From 7a0d5b7c3e32900049fb6df2ffa265648ef806ee Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 16 Jan 2024 17:43:09 +0000 Subject: [PATCH 035/291] [RTLBackend] Move top module node attribute into backend --- src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py | 4 +--- .../fpgadataflow/rtl/streamingdatawidthconverter_rtl.py | 5 +---- src/finn/custom_op/fpgadataflow/rtlbackend.py | 7 +++++-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index 3c8a1ad777..b8a1505018 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -56,8 +56,6 @@ def get_nodeattr_types(self): # Enable reprogrammable implementation to change FM dimensions, # stride, or dilation during runtime "dynamic_mode": ("i", False, 0, {0, 1}), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), } my_attrs.update(FMPadding.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index 2d17897afe..6fcfaa1db0 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -48,10 +48,7 @@ class StreamingDataWidthConverter_rtl(StreamingDataWidthConverter, RTLBackend): module.""" def get_nodeattr_types(self): - my_attrs = { - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), - } + my_attrs = {} my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 4c1977852c..96deb49161 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,7 +36,10 @@ class RTLBackend(ABC): when writing a new RTL custom op node.""" def get_nodeattr_types(self): - return {} + return { + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } @abstractmethod def generate_hdl(self): From 8f4e151433d1ddbb4b53ad7b768c474ce92417ba Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 17 Jan 2024 11:49:50 +0000 Subject: [PATCH 036/291] [BTS] Add memory estimation helper functions Signed-off-by: aziz bahri --- .../thresholding_binary_search.py | 75 ++++++++++++++++--- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index d02b778823..7d53d81de8 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -85,10 +85,60 @@ def get_nodeattr_types(self): "gen_top_module": ("s", False, ""), # bias to be applied to outputs of the node "activation_bias": ("i", False, 0), + # whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # memory depth triggers for threshold storage + "depth_trigger_uram": ("i", False, 0), + "depth_trigger_bram": ("i", False, 0), + # enable uniform thres optimization + # doesn't actually do anything yet, only + # for resource estimations + "uniform_thres": ("i", False, 0, {0, 1}), + # enable deep pipelining for easier timing closure + # setting to 0 may save some FFs but otherwise leave on + "deep_pipeline": ("i", False, 1, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def get_pe_mem_geometries(self): + pe = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + wdt_bits = wdt.bitwidth() + odt = self.get_output_datatype() + odt_bits = odt.bitwidth() + t_channels = self.get_nodeattr("NumChannels") + cf = t_channels / pe + is_uniform = self.get_nodeattr("uniform_thres") + if is_uniform: + ret = [(odt_bits - x, cf * (2**x)) for x in range(1, odt_bits)] + else: + ret = [(wdt_bits, (cf) * 2**x) for x in range(odt_bits)] + return ret + + def get_memory_estimate(self): + res_dict = {} + depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") + depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") + pe = self.get_nodeattr("PE") + ret = self.get_pe_mem_geometries() + for mem_cfg in ret: + (width, depth) = mem_cfg + primitives = mem_primitives_versal + if depth_trigger_bram != 0 or depth_trigger_uram != 0: + if depth >= depth_trigger_bram and depth < depth_trigger_uram: + primitives = {k: v for (k, v) in mem_primitives_versal.items() if "BRAM" in k} + elif depth >= depth_trigger_uram: + primitives = {k: v for (k, v) in mem_primitives_versal.items() if "URAM" in k} + alts = get_memutil_alternatives(mem_cfg, primitives) + primary_alt = alts[0] + res_type = primary_alt[0].split("_")[0] + res_count, eff, waste = primary_alt[1] + res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count + return res_dict + def calc_tmem(self): """Calculates and returns TMEM.""" num_channels = self.get_nodeattr("NumChannels") @@ -122,10 +172,16 @@ def verify_node(self): return [] def bram_estimation(self): - return 0 + res_dict = self.get_memory_estimate() + return res_dict.get("BRAM", 0) + + def uram_estimation(self): + res_dict = self.get_memory_estimate() + return res_dict.get("URAM", 0) def lut_estimation(self): - return 0 + res_dict = self.get_memory_estimate() + return res_dict.get("LUTRAM", 0) def get_input_datatype(self, ind=0): return DataType[self.get_nodeattr("inputDataType")] @@ -202,7 +258,8 @@ def get_number_output_values(self): return 0 def get_exp_cycles(self): - return 0 + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into @@ -221,23 +278,17 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): ), """Threshold matrix dimension is not as expected (2).""" n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr( - "numSteps" - ), "Mismatch in threshold steps" + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" if not self.get_input_datatype().signed(): # ensure all thresholds are nonnegative assert (orig_thres_matrix >= 0).all() # ensure all thresholds are integer - assert np.equal( - np.mod(orig_thres_matrix, 1), 0 - ).all(), "Need int threshold tensor" + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" ret = orig_thres_matrix # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (mh, 1)) - assert ( - ret.shape[0] == mh - ), "Channels of threshold matrix are not as expected (mh)" + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( From 7156f3f20c31dbf523f80c8c67e35bd8422b1bdc Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 17 Jan 2024 20:07:27 +0000 Subject: [PATCH 037/291] [CustomOp] Initial draft of convinputgen in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 10 +- .../fpgadataflow/convolutioninputgenerator.py | 424 +++------------ .../custom_op/fpgadataflow/hls/__init__.py | 4 + .../convolutioninputgenerator_hls.py} | 491 ++++++++---------- .../custom_op/fpgadataflow/rtl/__init__.py | 4 + .../convolutioninputgenerator_rtl.py | 210 +------- .../fpgadataflow/convert_to_hw_layers.py | 31 +- .../fpgadataflow/specialize_layers.py | 36 ++ .../test_fpgadataflow_convinputgenerator.py | 262 +++++++--- .../test_fpgadataflow_convinputgenerator1d.py | 268 ---------- ...est_fpgadataflow_convinputgenerator_rtl.py | 245 --------- 11 files changed, 593 insertions(+), 1392 deletions(-) rename src/finn/custom_op/fpgadataflow/{convolutioninputgenerator1d.py => hls/convolutioninputgenerator_hls.py} (63%) rename src/finn/custom_op/fpgadataflow/{ => rtl}/convolutioninputgenerator_rtl.py (85%) delete mode 100644 tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py delete mode 100755 tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 157dfa5c53..8254083ef7 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -34,12 +34,6 @@ from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, ) -from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( - ConvolutionInputGenerator1D, -) -from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( - ConvolutionInputGenerator_rtl, -) from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams from finn.custom_op.fpgadataflow.fmpadding import FMPadding @@ -69,9 +63,6 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["MatrixVectorActivation"] = MatrixVectorActivation -custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator -custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D -custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["TLastMarker"] = TLastMarker custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Pool_Batch"] = Pool_Batch @@ -86,6 +77,7 @@ custom_op["FMPadding"] = FMPadding custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp +custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["DownSampler"] = DownSampler custom_op["DuplicateStreams"] = DuplicateStreams custom_op["GlobalAccPool"] = GlobalAccPool diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 33c542d79d..3be0a117a8 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,33 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math import numpy as np -import os +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator: # input 0 is the input tensor, shape NHWC = (1, IFMDim, IFMDim, IFMChannels) # output 0 is the output tensor, shape NHWC: # = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels) -# note: the actual data layout produced by the hlslib kernels is different -# for depthwise and non-depthwise ops. -# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD) -# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD) -# see test_fpgadataflow_slidingwindow.py for an example of how to transform -# between the two layouts - -class ConvolutionInputGenerator(HLSCustomOp): - """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants. Depending on the combination of - attributes (e.g. depthwise or not, whether k % stride is 0) a different - variant will be picked for the actual HLS implementation.""" +class ConvolutionInputGenerator(HWCustomOp): + """Abstraction layer for HW implementation of ConvolutionInputGenerator""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -82,23 +73,13 @@ def get_nodeattr_types(self): "distributed", {"auto", "block", "distributed", "ultra"}, ), + "parallel_window": ("i", False, 0, {0, 1}), + # 1D (True) or 2D (False) spatial data + "is1D": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_nodeattr(self, name): - # overriding get_nodeattr to check for square kernel/img.. requirement - # since this can't be done with the attribute restriction in nodeattr_types - # TODO non-square can be enabled in theory but needs testing - ret = super().get_nodeattr(name) - props_to_check = ["ConvKernelDim", "IFMDim", "OFMDim", "Stride", "Dilation"] - if name in props_to_check: - is_square = ret[0] == ret[1] - assert is_square, "Only square %s supported" % name - if name == "Dilation": - assert ret[0] == ret[1] == 1, "Only dilation=1 supported" - return ret - def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -137,8 +118,12 @@ def get_folded_output_shape(self, ind=0): ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + if self.use_parallel_window_output(): + wf = int((ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + else: + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) return folded_oshape def make_shape_compatible_op(self, model): @@ -177,330 +162,93 @@ def get_instream_width(self, ind=0): return in_width def get_outstream_width(self, ind=0): - """Returns stream width, input and output stream width are equal for - the sliding window function, so the function to determine the input - stream width can be reused.""" - return self.get_instream_width() + if self.use_parallel_window_output(): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() num_output_elems = np.prod(folded_oshape[:-1]) return num_output_elems - def get_exp_cycles(self): - simd = self.get_nodeattr("SIMD") + def get_1d_conv_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # For the kernel, presenting the input data of size D as + # [H, W] = [Y, X] = [1, D] or [D, 1] + # effectively gives the same result. + # For consistency and ease of programming, this function + # returns the attributes of the layer as follows: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. ifm_ch = self.get_nodeattr("IFMChannels") - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv - cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) - max_cycles = max(cycles_write_block, cycles_read_block) - exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + # see defines() for an explanation + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) - return int(exp_cycles) + def get_exp_cycles(self): + return 0 def bram_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "block" or ram_style == "auto": - ram_depth = ifm_dim * ifm_ch / simd - if ram_depth <= 512: - ram_width = 36 - elif ram_depth <= 1024: - ram_width = 18 - elif ram_depth <= 2048: - ram_width = 9 - elif ram_depth <= 4096: - ram_width = 4 - elif ram_depth <= 8192: - ram_width = 2 - else: - ram_width = 1 - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) - * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) - ) - ) - else: - return 0 + return 0 def lut_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "distributed": - ram_luts = int( - (k + stride) - * ( - simd - * self.get_input_datatype().bitwidth() - * math.ceil(ifm_dim * ifm_ch / simd / 64) - ) - ) - else: - ram_luts = 0 - return 300 + ram_luts + return 0 def uram_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / 64) - * math.ceil(ifm_dim * ifm_ch / simd / 4096) - ) - ) - else: - return 0 + return 0 def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # using Im2Col node to calculate output node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] - - def defines(self, var): - numReps = 1 - ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + s = self.get_nodeattr("Stride") + d = self.get_nodeattr("Dilation") ifm_ch = self.get_nodeattr("IFMChannels") - ofm_dim = self.get_nodeattr("OFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - simd = self.get_nodeattr("SIMD") - ifm_precision = self.get_input_datatype().bitwidth() - - self.code_gen_dict["$DEFINES$"] = [ - """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n - #define Input_precision1 {}\n #define IFMDim1 {}\n - #define OFMDim1 {}\n #define SIMD1 {}\n - #define Stride1 {}\n #define numReps {}""".format( - k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + im2col_node = helper.make_node( + "Im2Col", + [node.input[0]], + [node.output[0]], + domain="qonnx.custom_op.general", + stride=[s[0], s[1]], + kernel_size=[k[0], k[1]], + dilations=[d[0], d[1]], + input_shape="(1,{},{},{})".format(ifm_dim[0], ifm_dim[1], ifm_ch), ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) + graph_im2col = helper.make_graph( + nodes=[im2col_node], + name="single-im2col-exec", + inputs=[inp], + outputs=[outp], ) - def docompute(self): - node = self.onnx_node - ram_style = self.get_nodeattr("ram_style") - map_to_hls_ram_style = { - "auto": "ap_resource_dflt()", - "block": "ap_resource_bram()", - "distributed": "ap_resource_lutram()", - "ultra": "ap_resource_uram()", - } - hls_ram_style = map_to_hls_ram_style[ram_style] - hls_call = node.op_type - - # check which ConvolutionInputGenerator is needed - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - - if k % stride != 0: - hls_call += "_kernel_stride" - - if self.get_nodeattr("depthwise") == 1: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}_dws (in0_{}, out_{}, numReps, {});""".format( - hls_call, self.hls_sname(), self.hls_sname(), hls_ram_style - ) - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0_{}, out_{}, numReps, {});""".format( - hls_call, self.hls_sname(), self.hls_sname(), hls_ram_style - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{})""".format( - self.onnx_node.name, self.hls_sname(), self.hls_sname() - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs)) + model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype()) + # use execution function from Im2Col node + # this automatically updates the execution context + inst = getCustomOp(im2col_node) + inst.execute_node(context, model_im2col.graph) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 8b1ca6e719..bcf36dad67 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -28,6 +28,9 @@ from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import ( + ConvolutionInputGenerator_hls, +) from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls @@ -47,6 +50,7 @@ # registered and plug in correctly into the infrastructure custom_op["AddStreams_hls"] = AddStreams_hls custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["ConvolutionInputGenerator_hls"] = ConvolutionInputGenerator_hls custom_op["DownSampler_hls"] = DownSampler_hls custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["FMPadding_hls"] = FMPadding_hls diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py similarity index 63% rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py rename to src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py index 046e8e096d..7223996e8b 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,15 +32,13 @@ import os import warnings from qonnx.core.datatype import DataType -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( + ConvolutionInputGenerator, +) +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -# This operation should only be used for 1D convolutions. Either the -# IFMDim_H or IFMDim_W should be '1', which represents the so-called -# dummy-dimension - # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D: # input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels) # output 0 is the output tensor, shape NHWC: @@ -53,7 +52,7 @@ # between the two layouts -class ConvolutionInputGenerator1D(HLSCustomOp): +class ConvolutionInputGenerator_hls(ConvolutionInputGenerator, HLSBackend): """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator (sliding window) function variants. Depending on the combination of attributes (e.g. depthwise or not, whether dilation is 0) a different @@ -63,175 +62,49 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] - "IFMChannels": ("i", True, 0), - "IFMDim": ("ints", True, []), # [H, W] = [Y, X] - "OFMDim": ("ints", True, []), # [H, W] = [Y, X] - "SIMD": ("i", True, 0), - "Stride": ("ints", True, []), # [H, W] = [Y, X] - "Dilation": ("ints", True, []), # [H, W] = [Y, X] - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0, {0, 1}), - # FPGA resource type for ConvolutionInputGenerator input buffer - # auto -- let Vivado HLS decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM - "ram_style": ( - "s", - False, - "distributed", - {"auto", "block", "distributed", "ultra"}, - ), - "parallel_window": ("i", False, 0, {0, 1}), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - simd = self.get_nodeattr("SIMD") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - if self.use_parallel_window_output(): - wf = int((ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) - else: - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) - return folded_oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = model.get_tensor_datatype(node.input[0]) - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - in_width = simd * ibits - return in_width - - def get_outstream_width(self, ind=0): - if self.use_parallel_window_output(): - # feed all window pixels in parallel - k_h, k_w = self.get_nodeattr("ConvKernelDim") - return self.get_instream_width() * k_h * k_w - else: - # if parallel variant not in use: same width for output and input stream - return self.get_instream_width() - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - num_output_elems = np.prod(folded_oshape[:-1]) - return num_output_elems - def get_swu_variant(self): - # checks which variant of the 1D ConvolutionInputGenerator (SWU) can be used - # We have 5 variants: ConvolutionInputGenerator_1D_parallel, + # checks which variant of the ConvolutionInputGenerator (SWU) can be used + # For the 2D case, we have 4 variants: + # ConvolutioninputGenerator, ConvolutioninputGenerator_dws, + # ConvolutioninputGenerator_kernel_stride, ConvolutioninputGenerator_kernel_stride_dws + # For the 1D case, we have 5 variants: ConvolutionInputGenerator_1D_parallel, # ConvolutionInputGenerator_1D_dws_naive, ConvolutionInputGenerator_1D, # ConvolutioninputGenerator_1D_dws, ConvolutionInputGenerator_1D_dws_stride is_dws = self.get_nodeattr("depthwise") - is_strided = np.prod(self.get_nodeattr("Stride")) > 1 - is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2 - is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1 - if self.use_parallel_window_output(): - return "ConvolutionInputGenerator_1D_parallel" - if not is_dws: - return "ConvolutionInputGenerator_1D" - if is_dws: - if (is_strided and not is_stride_2) or (is_dilated): - return "ConvolutionInputGenerator_1D_dws_naive" - elif is_stride_2: - return "ConvolutionInputGenerator_1D_dws_stride" - else: - return "ConvolutionInputGenerator_1D_dws" - - def get_1d_conv_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # For the kernel, presenting the input data of size D as - # [H, W] = [Y, X] = [1, D] or [D, 1] - # effectively gives the same result. - # For consistency and ease of programming, this function - # returns the attributes of the layer as follows: - # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. - # The dummy ('1') dimension is the Y-dimension. - ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - - # see defines() for an explanation - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + if self.get_nodeattr("is1D"): + is_strided = np.prod(self.get_nodeattr("Stride")) > 1 + is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2 + is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1 + if self.use_parallel_window_output(): + return "ConvolutionInputGenerator_1D_parallel" + if not is_dws: + return "ConvolutionInputGenerator_1D" + if is_dws: + if (is_strided and not is_stride_2) or (is_dilated): + return "ConvolutionInputGenerator_1D_dws_naive" + elif is_stride_2: + return "ConvolutionInputGenerator_1D_dws_stride" + else: + return "ConvolutionInputGenerator_1D_dws" + else: + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + hls_call = "ConvolutionInputGenerator" + if k % stride != 0: + hls_call += "_kernel_stride" + if is_dws: + hls_call += "_dws" + return hls_call def use_parallel_window_output(self): - # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to + if not self.get_nodeattr("is1D"): + return False + # If 1D, check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to # feed window in parallel to the following layer, enabling full SIMD unfolding. stride = self.get_nodeattr("Stride") dilation = self.get_nodeattr("Dilation") @@ -261,61 +134,88 @@ def use_parallel_window_output(self): def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() - - # since mmv != 1 is not supported yet, we set mmv for now to 1 - # mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - swu_variant = self.get_swu_variant() - if swu_variant == "ConvolutionInputGenerator_1D_parallel": - exp_cycles = k_w + ofm_dim_w - elif swu_variant == "ConvolutionInputGenerator_1D": - exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - exp_cycles = ( - 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1) - ) - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - cycles_read_block = ifm_dim_w * ifm_ch / simd - cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd - exp_cycles = cycles_read_block + cycles_write_block + # 2D case + if not self.get_nodeattr("is1D"): + ifm_ch = self.get_nodeattr("IFMChannels") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h + cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv + cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) + max_cycles = max(cycles_write_block, cycles_read_block) + exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + # 1D case + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": + exp_cycles = k_w + ofm_dim_w + elif swu_variant == "ConvolutionInputGenerator_1D": + exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + exp_cycles = ( + 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1) + ) + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + cycles_read_block = ifm_dim_w * ifm_ch / simd + cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd + exp_cycles = cycles_read_block + cycles_write_block return int(exp_cycles) def bram_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": return 0 if ram_style == "block" or ram_style == "auto": - if swu_variant == "ConvolutionInputGenerator_1D": - ram_depth = (k_w - 1) * ifm_ch / simd - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - ram_depth = ifm_dim_w * ifm_ch / simd - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - ram_depth = k_w * ifm_ch / simd + if not is1D: + ram_depth = ifm_dim * ifm_ch / simd + else: + if swu_variant == "ConvolutionInputGenerator_1D": + ram_depth = (k_w - 1) * ifm_ch / simd + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + ram_depth = ifm_dim_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + ram_depth = k_w * ifm_ch / simd + # after calculate the ram_depth depending on the variant + # determine ram_width if ram_depth <= 512: ram_width = 36 elif ram_depth <= 1024: @@ -328,27 +228,48 @@ def bram_estimation(self): ram_width = 2 else: ram_width = 1 + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) - depth_mul = math.ceil(ram_depth / 18432) - return width_mul * depth_mul + if not is1D: + depth_mul = math.ceil(ifm_dim * ifm_ch / simd / ram_depth) + return int((k + stride) * width_mul * depth_mul) + else: + depth_mul = math.ceil(ram_depth / 18432) + return int(width_mul * depth_mul) else: return 0 def lut_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_noadeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": ram_luts = math.ceil(simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64) - elif ram_style == "distributed": + if ram_style == "distributed": + if not is1D: + ram_luts = int( + (k + stride) + * ( + simd + * self.get_input_datatype().bitwidth() + * math.ceil(ifm_dim * ifm_ch / simd / 64) + ) + ) if swu_variant == "ConvolutionInputGenerator_1D": ram_luts = math.ceil(self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64) elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": @@ -364,34 +285,51 @@ def lut_estimation(self): def uram_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + ram_style = self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": return 0 - elif ram_style == "ultra": - if swu_variant == "ConvolutionInputGenerator_1D": - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) - return width_mul * depth_mul - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) - return width_mul * depth_mul - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) - return width_mul * depth_mul + if ram_style == "ultra": + if not is1D: + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / 64) + * math.ceil(ifm_dim * ifm_ch / simd / 4096) + ) + ) + else: + if swu_variant == "ConvolutionInputGenerator_1D": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) + return width_mul * depth_mul else: return 0 @@ -485,18 +423,28 @@ def global_includes(self): def defines(self, var): numReps = 1 - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") simd = self.get_nodeattr("SIMD") ifm_precision = self.get_input_datatype().bitwidth() + if not is1D: + ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_ch = self.get_nodeattr("IFMChannels") + ofm_dim = self.get_nodeattr("OFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + swu_variant = self.get_swu_variant() + # check all different 1D scenarios if swu_variant in [ "ConvolutionInputGenerator_1D_parallel", "ConvolutionInputGenerator_1D", @@ -523,7 +471,7 @@ def defines(self, var): numReps, ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws": + elif swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n @@ -543,7 +491,7 @@ def defines(self, var): numReps, ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n @@ -567,6 +515,16 @@ def defines(self, var): numReps, ) ] + # default to 2D cases + else: + self.code_gen_dict["$DEFINES$"] = [ + """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n + #define Input_precision1 {}\n #define IFMDim1 {}\n + #define OFMDim1 {}\n #define SIMD1 {}\n + #define Stride1 {}\n #define numReps {}""".format( + k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps + ) + ] def read_npy_data(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -617,7 +575,7 @@ def docompute(self): hls_ram_style = map_to_hls_ram_style[ram_style] swu_variant = self.get_swu_variant() - # check which ConvolutionInputGenerator is needed + # check which 1D ConvolutionInputGenerator is needed if swu_variant == "ConvolutionInputGenerator_1D_parallel": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -634,7 +592,7 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws": + elif swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -642,7 +600,7 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_stride": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_stride": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -650,7 +608,7 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} @@ -658,6 +616,13 @@ def docompute(self): swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style + ) + ] def dataoutstrm(self): code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 81110d8b9f..ac75371381 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -26,6 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl import ( + ConvolutionInputGenerator_rtl, +) from finn.custom_op.fpgadataflow.rtl.fmpadding_rtl import FMPadding_rtl from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( StreamingDataWidthConverter_rtl, @@ -35,5 +38,6 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py similarity index 85% rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py rename to src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 734f75a973..ba3921745f 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +34,10 @@ from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( + ConvolutionInputGenerator, +) +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -56,169 +59,34 @@ # NOTE: "Parallel" implementation style not yet implemented in this version! -class ConvolutionInputGenerator_rtl(HLSCustomOp): - """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants. Generates an RTL ConvolutionInputGenerator - implementation based on (System-)Verilog templates, defined in finn-rtllib/swg.""" +class ConvolutionInputGenerator_rtl(ConvolutionInputGenerator, RTLBackend): + """Class that corresponds to finn-rtllib swg module. + Generates an RTL ConvolutionInputGenerator implementation + based on (System-)Verilog templates, defined in finn-rtllib/swg.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { - "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] - "IFMChannels": ("i", True, 0), - "IFMDim": ("ints", True, []), # [H, W] = [Y, X] - "OFMDim": ("ints", True, []), # [H, W] = [Y, X] - "SIMD": ("i", True, 0), # additional parallelization parameter - not yet implemented "M": ("i", False, 1), - # Enable parallel window output (requires full SIMD unfolding) - "parallel_window": ("i", False, 0, {0, 1}), - "Stride": ("ints", True, []), # [H, W] = [Y, X] - "Dilation": ("ints", True, []), # [H, W] = [Y, X] - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0, {0, 1}), # Enable reprogrammable implementation to change FM dimensions, # stride, or dilation during runtime (requires parallel_window = 0) "dynamic_mode": ("i", False, 0, {0, 1}), - # FPGA resource type for ConvolutionInputGenerator input buffer - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - simd = self.get_nodeattr("SIMD") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - if self.get_nodeattr("parallel_window"): - wf = int((ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) - else: - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) - return folded_oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = model.get_tensor_datatype(node.input[0]) - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - in_width = simd * ibits - return in_width - - def get_outstream_width(self, ind=0): - if self.get_nodeattr("parallel_window"): - # feed all window pixels in parallel - k_h, k_w = self.get_nodeattr("ConvKernelDim") - return self.get_instream_width() * k_h * k_w - else: - # if parallel variant not in use: same width for output and input stream - return self.get_instream_width() - def get_number_input_values(self): """Function to get the number of expected input values.""" folded_ishape = self.get_folded_input_shape() num_input_elems = np.prod(folded_ishape[:-1]) return num_input_elems - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - num_output_elems = np.prod(folded_oshape[:-1]) - return num_output_elems - - def get_1d_conv_attrs_normalized(self): - """Returns normalized spatial attributes, where H=1 for the 1D case.""" - # normalize FM dimensions so that: - # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. - # The dummy ('1') dimension is the Y-dimension. - ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + def use_parallel_window_output(self): + return self.get_nodeattr("parallel_window") def get_buffer_depth(self): """Returns total depth of the internal buffer, depending on @@ -1170,55 +1038,3 @@ def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None): "cfg_last_write": (15 * 4, int(code_gen_dict["$LAST_WRITE_ELEM$"][0])), } return config - - def code_generation_ipgen(self, model, fpgapart, clk): - """Generates (System-)Verilog code for IP generation (instead of HLS code).""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Not implemented (RTL component).""" - pass - - def code_generation_cppsim(self, model): - """Not implemented (RTL component).""" - pass - - def compile_singlenode_code(self): - """Not implemented (RTL component).""" - pass - - def global_includes(self): - """Not implemented (RTL component).""" - pass - - def defines(self, var): - """Not implemented (RTL component).""" - pass - - def read_npy_data(self): - """Not implemented (RTL component).""" - pass - - def strm_decl(self): - """Not implemented (RTL component).""" - pass - - def docompute(self): - """Not implemented (RTL component).""" - pass - - def dataoutstrm(self): - """Not implemented (RTL component).""" - pass - - def save_as_npy(self): - """Not implemented (RTL component).""" - pass - - def blackboxfunction(self): - """Not implemented (RTL component).""" - pass - - def pragmas(self): - """Not implemented (RTL component).""" - pass diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index a65c925f97..28b7dba9cb 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -58,6 +58,7 @@ def apply(self, model): i2c_input = n.input[0] i2c_output = n.output[0] i2c_in_shape = model.get_tensor_shape(i2c_input) + i2c_out_shape = model.get_tensor_shape(i2c_output) dt = model.get_tensor_datatype(i2c_input) if not dt.is_integer(): warnings.warn("%s : Input is not int. Can't infer ConvInpGen." % n.name) @@ -69,11 +70,13 @@ def apply(self, model): pad_h = pad_attr[0] + pad_attr[2] pad_w = pad_attr[1] + pad_attr[3] dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations") - # temporary checks until non-square conv support is finalized pad_val = i2c_inst.get_nodeattr("pad_value") + depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] ifm_dim_h = i2c_in_shape[1] ifm_dim_w = i2c_in_shape[2] + ofm_dim_h = i2c_out_shape[1] + ofm_dim_w = i2c_out_shape[2] # default params for ConvolutionInputGenerator ConvInpGen_node_idx = node_ind @@ -122,9 +125,9 @@ def apply(self, model): is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w is_equal_stride = stride_h == stride_w - # Ensure that only supported HLS nodes are inserted + is_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + downsample_1D = is_1D is1D_unitx = ifm_dim_w == 1 downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride if not (downsample_1D or downsample_2D): @@ -148,9 +151,27 @@ def apply(self, model): is1D=downsample_1D, is1D_unitx=is1D_unitx, ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) else: - continue + ConvInpGen_node = helper.make_node( + "ConvolutionInputGenerator", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ConvKernelDim=[k_h, k_w], + IFMChannels=ifm_ch, + IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], + OFMDim=[ofm_dim_h, ofm_dim_w], + SIMD=ifm_ch, + Stride=[stride_h, stride_w], + Dilation=[dilation_h, dilation_w], + inputDataType=dt.name, + outputDataType=dt.name, + depthwise=depthwise, + is1D=is_1D, + name="ConvolutionInputGenerator_" + n.name, + ) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index eff40f83f3..6c1def628f 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -79,6 +79,18 @@ def _determine_impl_style(node): # check if user setting can be fulfilled # otherwise change impl_style if impl_style == "hls": + if optype == "ConvolutionInputGenerator": + if not _swg_hls_possible(node): + warn_str = ( + """Settings are not supported in HLS. Node %s will automatically be + set to RTL variant.""" + % node.name + ) + warnings.warn(warn_str) + return "rtl" + else: + return "hls" + if hls_variant: return "hls" elif rtl_variant: @@ -149,6 +161,30 @@ def _dwc_determine_impl_style(node): return "hls" +def _swg_hls_possible(node): + # the 2D HLS implementation for SWG + # can only be used for square inputs + # and no dilation + swg = getCustomOp(node) + # extract all attributes to check + k = swg.get_nodeattr("ConvKernelDim") + ifm_dim = swg.get_nodeattr("IFMDim") + ofm_dim = swg.get_nodeattr("OFMDim") + s = swg.get_nodeattr("Stride") + d = swg.get_nodeattr("Dilation") + # check if square and dilation=1 + if ( + k[0] == k[1] + and ifm_dim[0] == ifm_dim[1] + and ofm_dim[0] == ofm_dim[1] + and s[0] == s[1] + and d[0] == d[1] == 1 + ): + return True + else: + return False + + class SpecializeLayers(Transformation): """Specialize all layers to either HLS or RTL variants""" diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index d94b5d6399..07de85d0b5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,11 +33,13 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -44,26 +47,34 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt): +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) im2col_node = helper.make_node( "Im2Col", ["inp"], ["outp"], - domain="qonnx.custom_op.general", - stride=[stride, stride], - kernel_size=[k, k], - input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)), + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], pad_amount=[0, 0, 0, 0], pad_value=0, - dilations=[dilation, dilation], + depthwise=dw, ) graph = helper.make_graph( nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] @@ -78,86 +89,209 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, d return model -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# input datatype +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT2"]]) +# kernel size +@pytest.mark.parametrize("k", [[2, 2], [3, 3]]) +# input dimension +@pytest.mark.parametrize("ifm_dim", [[6, 6], [8, 8]]) +# input channels +@pytest.mark.parametrize("ifm_ch", [2, 4]) +# Stride +@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +# Dilation +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +# execution mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# input channel parallelism ("SIMD") +@pytest.mark.parametrize("simd", [1, 2]) +# depthwise +@pytest.mark.parametrize("dw", [0, 1]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0, 1]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +# implementation style +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_slidingwindow( + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + exec_mode, + simd, + dw, + parallel_window, + m, + flip, + impl_style, ): - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] - ) + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k, k], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim, ifm_dim], - OFMDim=[ofm_dim, ofm_dim], - SIMD=simd, - Stride=[stride, stride], - Dilation=[dilation, dilation], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") - return model + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + # prepare input data + input_dict = prepare_inputs(x) + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] -def prepare_inputs(input_tensor): - return {"inp": input_tensor} + model = model.transform(to_hw.InferConvInpGen()) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + # set impl_style + inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers()) + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + if model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + + if exec_mode == "cppsim": + if model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl": + pytest.skip("cppsim not supported for RTL DWC") + else: + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + elif exec_mode == "rtlsim": + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + else: + raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") + + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + if dw == 0: + assert (y_produced == y_expected).all() + else: + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) + y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) + assert (y_produced == y_expected).all() + + if exec_mode == "rtlsim" and impl_style == "hls": + nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + if nodes: + node = nodes[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 + else: + assert model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl" # input datatype -@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT2"]]) +@pytest.mark.parametrize("idt", [DataType["INT8"]]) # kernel size -@pytest.mark.parametrize("k", [2, 3]) +@pytest.mark.parametrize("k", [[4, 1]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [6, 8]) +@pytest.mark.parametrize("ifm_dim", [[10, 1]]) # input channels -@pytest.mark.parametrize("ifm_ch", [2, 4]) +@pytest.mark.parametrize("ifm_ch", [1, 4]) # Stride -@pytest.mark.parametrize("stride", [1, 2]) +@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) # Dilation -# Currently only dilation value of 1 is supported -@pytest.mark.parametrize("dilation", [1]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 2]) +@pytest.mark.parametrize("simd", [1, 4]) # depthwise @pytest.mark.parametrize("dw", [0, 1]) +# TODO add parallel_window and M option +# implementation style +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw): +def test_fpgadataflow_slidingwindow1d( + idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, impl_style +): ofm_dim = int(((ifm_dim - k) / stride) + 1) x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( + input_dict = prepare_inputs(x) + model = make_single_im2col_modelwrapper( k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw ) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] + + model = model.transform(to_hw.InferConvInpGen()) + model.save("model_before.onnx") + # set impl_style + inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers()) + # set simd + inst = getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + model.save("model_after.onnx") if exec_mode == "cppsim": - model = model.transform(SetExecMode("cppsim")) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) + if impl_style == "rtl": + pytest.skip("cppsim not supported for RTL DWC") + else: + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) elif exec_mode == "rtlsim": model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) @@ -167,14 +301,8 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, dilation, e else: raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - # prepare input data - input_dict = prepare_inputs(x) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] if dw == 0: assert (y_produced == y_expected).all() @@ -184,8 +312,8 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, dilation, e y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) assert (y_produced == y_expected).all() - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0] + if exec_mode == "rtlsim" and impl_style == "hls": + node = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py deleted file mode 100644 index aa89dde5e7..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - -fpga_part = "xczu3eg-sbva484-1-e" - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="qonnx.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = qonnx_make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, parallel_window, dw=0 -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator1D", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - parallel_window=parallel_window, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -# input datatype -# @pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT8"]]) -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -# kernel size -@pytest.mark.parametrize("k", [[4, 1]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[10, 1]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [1, 4]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) -# execution mode -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -# input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 4]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# Flip dimensions -@pytest.mark.parametrize("flip", [False, True]) -# Use parallel window output variant -@pytest.mark.parametrize("parallel_window", [False, True]) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado -def test_fpgadataflow_slidingwindow_1d( - idt, - k, - ifm_dim, - ifm_ch, - stride, - dilation, - exec_mode, - simd, - dw, - flip, - parallel_window, -): - if flip: - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1): - pytest.skip( - """Dilation value greater than 1 and stride greater than 1 - currently not supported for 1D convolutions""" - ) - if (dilation_h > 1 or dilation_w > 1) and dw == 0: - pytest.skip( - """Dilation value greater than 1 currently not supported - for non-dws 1D convolutions""" - ) - if simd > ifm_ch: - pytest.skip("SIMD cannot be larger than number of input channels") - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - stride=stride, - dilation=dilation, - idt=idt, - parallel_window=parallel_window, - dw=dw, - ) - - if exec_mode == "cppsim": - model = model.transform(SetExecMode("cppsim")) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - elif exec_mode == "rtlsim": - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(fpga_part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - else: - raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - - # prepare input data - input_dict = prepare_inputs(x) - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - stride=stride, - dilation=dilation, - idt=idt, - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) - assert (y_produced == y_expected).all() - - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")[0] - inst = getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py deleted file mode 100755 index 62b7abe536..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="finn.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = qonnx_make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator_rtl", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - M=m, - parallel_window=parallel_window, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -# input datatype -@pytest.mark.parametrize("idt", [DataType["UINT4"]]) -# kernel size -@pytest.mark.parametrize("k", [[3, 3], [1, 5]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[13, 13], [1, 21]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [6]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 3, 6]) -# parallel_window enable (MMV_out = M*K) -@pytest.mark.parametrize("parallel_window", [0, 1]) -# in/out MMV ("M") -@pytest.mark.parametrize("m", [1]) -# Flip dimensions -@pytest.mark.parametrize("flip", [False]) -@pytest.mark.slow -@pytest.mark.vivado -@pytest.mark.fpgadataflow -def test_fpgadataflow_slidingwindow_rtl( - idt, k, ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip -): - if flip: - if ( - ifm_dim[0] == ifm_dim[1] - and k[0] == k[1] - and stride[0] == stride[1] - and dilation[0] == dilation[1] - ): - pytest.skip("Dimension flip would have no effect") - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation - kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation - - if simd > ifm_ch: - pytest.skip("SIMD cannot be larger than number of input channels") - if ifm_ch % simd != 0: - pytest.skip("SIMD must divide number of input channels") - if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") - if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") - if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): - pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") - if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): - pytest.skip("Not all combinations for stride > k edge case supported in default mode") - if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): - pytest.skip("Parallel window requires SIMD=C for non-depthwise case") - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ) - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) - model = model.transform(PrepareRTLSim()) - - # prepare input data - input_dict = prepare_inputs(x) - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - stride=stride, - dilation=dilation, - idt=idt, - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) - assert (y_produced == y_expected).all() From f2f56d6466fab50f0bc3e92d9c0dae4faae8cda4 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 18 Jan 2024 09:42:29 +0000 Subject: [PATCH 038/291] [BTS] WIP: custom changes --- .../thresholding_binary_search.py | 232 ++++++++++++++---- 1 file changed, 184 insertions(+), 48 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py index 7d53d81de8..cde0d8dc79 100755 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py @@ -26,14 +26,27 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math import numpy as np import os +import shutil import warnings +from pyverilator.util.axi_utils import rtlsim_multi_io from qonnx.core.datatype import DataType -from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions +from qonnx.util.basic import ( + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import find_next_power_of_2, get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import ( + find_next_power_of_2, + get_memutil_alternatives, + get_rtlsim_trace_depth, + make_build_dir, + mem_primitives_versal, + pyverilate_get_liveness_threshold_cycles, +) from finn.util.data_packing import ( npy_to_rtlsim_input, pack_innermost_dim_as_hex_string, @@ -255,7 +268,7 @@ def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() def get_number_output_values(self): - return 0 + return np.prod(self.get_folded_output_shape()[:-1]) def get_exp_cycles(self): # Channels/PE * batch size * fmdim * fmdim @@ -305,11 +318,70 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def prepare_codegen_rtl_values(self): + def get_all_meminit_filenames(self, abspath=False): + "Return a list of all .dat memory initializer files used for this node" + dat_files = [] + t_path = self.get_nodeattr("code_gen_dir_ipgen") if abspath else "." + pe = self.get_nodeattr("PE") + output_data_type = self.get_nodeattr("outputDataType") # output precision + o_bitwidth = DataType[output_data_type].bitwidth() + for stage in range(o_bitwidth): + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + dat_files.append(thresh_file) + return dat_files + + def prepare_codegen_rtl_values(self, model): """All dictionary values produced in this function are to replace their key value(s) in the RTL template files""" code_gen_dict = {} + # TODO check for sortedness and size here? + # RTL component currently always expects 2^N-1 thresholds, but + # sometimes we have fewer due to e.g. narrow range quantization + thresholds = model.get_initializer(self.onnx_node.input[1]) + # add dummy dimension as final dimension (that's what gets packed with next call) + thresholds = np.expand_dims(thresholds, axis=-1) + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) + t_packed = pack_innermost_dim_as_hex_string( + thresholds, + wdt, + bw_hexdigit, + prefix="", + ) + + t_path = self.get_nodeattr("code_gen_dir_ipgen") + pe = self.get_nodeattr("PE") + output_data_type = self.get_nodeattr("outputDataType") # output precision + o_bitwidth = DataType[output_data_type].bitwidth() + num_channels = self.get_nodeattr("NumChannels") # number of channels + + channel_fold = int(num_channels / pe) + + for stage in range(o_bitwidth): + sn = o_bitwidth - stage - 1 + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + threshs = np.zeros([channel_fold * (2**stage)], dtype="object") + for ch in range(channel_fold): + for i in range(2**stage): + threshs[(ch << stage) + i] = t_packed[ch * pe + pe_value][ + (i << (o_bitwidth - stage)) + 2**sn - 1 + ] + with open(thresh_file, "w") as f: + for val in threshs: + f.write(val + "\n") + code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] + # Identify the module name code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ self.get_verilog_top_module_name() + "_axi_wrapper" @@ -318,19 +390,13 @@ def prepare_codegen_rtl_values(self): code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] # Identify the module variables - output_data_type = self.get_nodeattr("outputDataType") # output precision - input_data_type = self.get_nodeattr( - "inputDataType" - ) # input/threshold precision - num_channels = self.get_nodeattr("NumChannels") # number of channels + input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision bias = self.get_nodeattr("activation_bias") # activation bias value - pe = self.get_nodeattr("PE") + i_bitwidth = DataType[input_data_type].bitwidth() - code_gen_dict["$N$"] = [ - str(DataType[output_data_type].bitwidth()) - ] # output precision - convert bitwidth to string + code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string code_gen_dict["$M$"] = [ - str(DataType[input_data_type].bitwidth()) + str(i_bitwidth) ] # input/threshold precision - convert bitwidth to string code_gen_dict["$C$"] = [str(num_channels)] # number of channels code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value @@ -343,11 +409,34 @@ def prepare_codegen_rtl_values(self): else: code_gen_dict["$SIGNED$"] = [str(0)] + if bias >= 0: + o_bits = math.ceil(math.log2(2**o_bitwidth + bias)) + else: + o_bits = 1 + math.ceil( + math.log2(-bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias) + ) + + code_gen_dict["$O_BITS$"] = [str(int(o_bits))] + + rt_weights = self.get_nodeattr("runtime_writeable_weights") + code_gen_dict["$USE_AXILITE$"] = [str(rt_weights)] + + depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") + depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") + deep_pipeline = self.get_nodeattr("deep_pipeline") + code_gen_dict["$DEPTH_TRIGGER_URAM$"] = [str(depth_trigger_uram)] + code_gen_dict["$DEPTH_TRIGGER_BRAM$"] = [str(depth_trigger_bram)] + code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)] return code_gen_dict def get_rtl_file_list(self): """Thresholding binary search RTL file list""" - return ["thresholding.sv", "thresholding_axi.sv", "thresholding_axi_wrapper.v"] + return [ + "axilite_if.v", + "thresholding.sv", + "thresholding_axi.sv", + "thresholding_template_wrapper.v", + ] def get_rtl_file_paths(self): """Get full path of all RTL files""" @@ -372,14 +461,18 @@ def fill_in_rtl_template_data(self, replace_dict, template_data): def dump_rtl_data(self, dest_dir, filename, data): """Dump filled-in-template RTL files for future synthesis step""" + # when generating template files, handle a special case: + # if the filename contains the word "template", replace that + # with the node name to distinguish between instances + filename = filename.replace("template", self.onnx_node.name) with open(os.path.join(dest_dir, filename), "w") as f: f.write(data) return - def generate_hdl(self): + def generate_hdl(self, model): """Prepare HDL files from templates for synthesis""" # Generate a dictionary of values to put in RTL template - code_gen_dict = self.prepare_codegen_rtl_values() + code_gen_dict = self.prepare_codegen_rtl_values(model) # Retrieve the destination directory for the final RTL files code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -399,7 +492,7 @@ def generate_hdl(self): return def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl() + self.generate_hdl(model) # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain @@ -419,15 +512,20 @@ def prepare_rtlsim(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") verilog_paths = [code_gen_dir] - verilog_files = self.get_rtl_file_list() + verilog_files = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] + dat_files = self.get_all_meminit_filenames(abspath=True) + single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + for dat_file in dat_files: + shutil.copy(dat_file, single_src_dir) # build the Verilator emulation library sim = PyVerilator.build( verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + build_dir=single_src_dir, verilog_path=verilog_paths, trace_depth=get_rtlsim_trace_depth(), top_module_name=self.get_nodeattr("gen_top_module"), + auto_eval=False, ) # save generated lib filename in attribute @@ -450,8 +548,7 @@ def execute_node(self, context, graph): in_ind = 0 for inputs in node.input: # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds + # the second input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -480,25 +577,16 @@ def execute_node(self, context, graph): # Create a PyVerilator wrapper of the RTLSim .so sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - wei = npy_to_rtlsim_input( - "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits - ) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + io_names = self.get_verilog_top_module_intf_names() + istream_name = io_names["s_axis"][0][0] + ostream_name = io_names["m_axis"][0][0] io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"s_axis": []}, + "inputs": {istream_name: inp}, + "outputs": {ostream_name: []}, } self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] + output = io_dict["outputs"][ostream_name] # Manage output data odt = self.get_output_datatype() @@ -507,9 +595,7 @@ def execute_node(self, context, graph): out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -518,16 +604,55 @@ def execute_node(self, context, graph): context[node.output[0]] = output return + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + # no additional prefix/suffix in interface names since this is an RTL component + return "" + + def rtlsim_multi_io(self, sim, io_dict): + "Run rtlsim for this node, supports multiple i/o streams." + + rtlsim_so = self.get_nodeattr("rtlsim_so") + so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) + olcwd = os.getcwd() + os.chdir(so_dir) + + # signal name prefix + # TODO if the interface names on this component get standardized, + # it won't need its own rtlsim_multi_io variant anymore and can just + # use the base class one + sname = "_" + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + num_out_values = self.get_number_output_values() + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + do_reset=True, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + os.chdir(olcwd) + def code_generation_ipi(self): """Constructs and returns the TCL commands for node instantiation as an RTL block.""" - cmd = [] - rtl_file_list = self.get_rtl_file_list() + rtl_file_list = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name + cmd = ["file mkdir %s" % source_target] for rtl_file in rtl_file_list: cmd.append( - "add_files -norecurse %s" % (os.path.join(code_gen_dir, rtl_file)) + "add_files -copy_to %s -norecurse %s" + % (source_target, os.path.join(code_gen_dir, rtl_file)) ) # Create an RTL block, not an IP core (-type ip) @@ -548,8 +673,17 @@ def get_verilog_top_module_intf_names(self): axilite always assumed to be 32 bits and is not tuple (name only). Each block must have at most one aximm and one axilite.""" - intf_names = super().get_verilog_top_module_intf_names() - intf_names["axilite"] = ["s_axilite"] + intf_names = {} + intf_names["clk"] = ["ap_clk"] + intf_names["rst"] = ["ap_rst_n"] + intf_names["s_axis"] = [("in0_V", self.get_instream_width_padded())] + intf_names["m_axis"] = [("out_V", self.get_outstream_width_padded())] + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + if self.get_nodeattr("runtime_writeable_weights") == 1: + intf_names["axilite"] = ["s_axilite"] + return intf_names def get_dynamic_config(self, model, address_stride=1): @@ -566,6 +700,8 @@ def get_dynamic_config(self, model, address_stride=1): config = {} channel_cntr = 0 + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) for channel in thresholds: channel_start_addr = channel_cntr * weight_addr_boundary * address_stride weight_cntr = 0 @@ -580,8 +716,8 @@ def get_dynamic_config(self, model, address_stride=1): str( pack_innermost_dim_as_hex_string( [weight], - self.get_weight_datatype(), - self.get_weight_datatype().bitwidth(), + wdt, + bw_hexdigit, ) ), 0, From 72ac0e5493aa499a094573dc9a8e5518cbfb9f4d Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 18 Jan 2024 09:45:43 +0000 Subject: [PATCH 039/291] [BTS] threshold supports other memory modes --- .../transformation/fpgadataflow/convert_to_hls_layers.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index a50cbbaed1..c43f058fac 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1081,13 +1081,6 @@ def apply(self, model): is_rtl_variant_compatible = True # Perform checks for RTL variant if chosen - if self.use_rtl_variant: - assert self.mem_mode == "decoupled", ( - """%s : RTL Thresholding only supports 'decoupled' memory - mode.""" - % node.name - ) - if self.use_rtl_variant and is_rtl_variant_compatible: new_node = helper.make_node( "Thresholding_Binary_Search", From a3d6b340d33faa6f30a3239f1a95513e5364cfe8 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 18 Jan 2024 09:47:51 +0000 Subject: [PATCH 040/291] [BTS] WIP: memory estimation helpers --- src/finn/util/basic.py | 51 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 5252422dcf..0a6c0b39c9 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -30,6 +30,7 @@ import subprocess import sys import tempfile +from qonnx.util.basic import roundup_to_integer_multiple # test boards test_board_map = ["Pynq-Z1", "KV260_SOM", "ZCU104", "U250"] @@ -76,6 +77,11 @@ alveo_default_platform["U280"] = "xilinx_u280_gen3x16_xdma_1_202211_1" alveo_default_platform["U55C"] = "xilinx_u55c_gen3x16_xdma_3_202210_1" +# Create a joint part map, encompassing other boards too +part_map = {**pynq_part_map, **alveo_part_map} +part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S" +part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S" + def get_rtlsim_trace_depth(): """Return the trace depth for rtlsim via PyVerilator. Controllable @@ -247,3 +253,48 @@ def find_next_power_of_2(n): # unset rightmost bit n = n & n - 1 return n << 1 + + +mem_primitives_versal = { + "URAM_72x4096": (72, 4096), + "URAM_36x8192": (36, 8192), + "URAM_18x16384": (18, 16384), + "URAM_9x32768": (9, 32768), + "BRAM18_36x512": (36, 512), + "BRAM18_18x1024": (18, 1024), + "BRAM18_9x2048": (9, 2048), + "LUTRAM": (1, 64), +} + + +def get_memutil_alternatives( + req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True +): + ret = [ + (primitive_name, memutil(req_mem_spec, primitive_spec)) + for (primitive_name, primitive_spec) in mem_primitives.items() + ] + if sort_min_waste: + ret = sorted(ret, key=lambda x: x[1][2]) + return ret + + +def memutil(req_mem_spec, primitive_spec): + """Computes how many instances of a memory primitive are necessary to + implemented a desired memory size, where req_mem_spec is the desired + size and the primitive_spec is the primitve size. The sizes are expressed + as tuples of (mem_width, mem_depth). Returns (primitive_count, efficiency, waste) + where efficiency in range [0,1] indicates how much of the total capacity is + utilized, and waste indicates how many bits of storage are wasted.""" + + req_width, req_depth = req_mem_spec + prim_width, prim_depth = primitive_spec + + match_width = roundup_to_integer_multiple(req_width, prim_width) + match_depth = roundup_to_integer_multiple(req_depth, prim_depth) + count_width = match_width // prim_width + count_depth = match_depth // prim_depth + count = count_depth * count_width + eff = (req_width * req_depth) / (count * prim_width * prim_depth) + waste = (count * prim_width * prim_depth) - (req_width * req_depth) + return (count, eff, waste) From 234e568a268e8d670e3c255b7586068a9b32f28a Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 18 Jan 2024 16:06:20 +0000 Subject: [PATCH 041/291] [Tests] Combine 1D and 2D tests for swg --- .../test_fpgadataflow_convinputgenerator.py | 102 ++---------------- 1 file changed, 11 insertions(+), 91 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 07de85d0b5..1a9a934df1 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -94,21 +94,21 @@ def prepare_inputs(input_tensor): # input datatype -@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT2"]]) +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]]) # kernel size -@pytest.mark.parametrize("k", [[2, 2], [3, 3]]) +@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) # input dimension -@pytest.mark.parametrize("ifm_dim", [[6, 6], [8, 8]]) +@pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) # input channels @pytest.mark.parametrize("ifm_ch", [2, 4]) # Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) +@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [2, 1]]) # Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [2, 1]]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 2]) +@pytest.mark.parametrize("simd", [1, 2, 4]) # depthwise @pytest.mark.parametrize("dw", [0, 1]) # parallel_window enable (MMV_out = M*K) @@ -193,9 +193,13 @@ def test_fpgadataflow_slidingwindow( # set simd inst = getCustomOp(model.graph.node[0]) inst.set_nodeattr("SIMD", simd) - if model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl": + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": inst.set_nodeattr("parallel_window", parallel_window) inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) if exec_mode == "cppsim": if model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl": @@ -236,87 +240,3 @@ def test_fpgadataflow_slidingwindow( assert exp_cycles != 0 else: assert model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl" - - -# input datatype -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -# kernel size -@pytest.mark.parametrize("k", [[4, 1]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[10, 1]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [1, 4]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) -# execution mode -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -# input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 4]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# TODO add parallel_window and M option -# implementation style -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado -def test_fpgadataflow_slidingwindow1d( - idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw, impl_style -): - ofm_dim = int(((ifm_dim - k) / stride) + 1) - - x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) - input_dict = prepare_inputs(x) - model = make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw - ) - y_expected = oxe.execute_onnx(model, input_dict)["outp"] - - model = model.transform(to_hw.InferConvInpGen()) - model.save("model_before.onnx") - # set impl_style - inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) - inst.set_nodeattr("preferred_impl_style", impl_style) - model = model.transform(SpecializeLayers()) - # set simd - inst = getCustomOp(model.graph.node[0]) - inst.set_nodeattr("SIMD", simd) - model.save("model_after.onnx") - - if exec_mode == "cppsim": - if impl_style == "rtl": - pytest.skip("cppsim not supported for RTL DWC") - else: - model = model.transform(SetExecMode("cppsim")) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - elif exec_mode == "rtlsim": - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - else: - raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) - assert (y_produced == y_expected).all() - - if exec_mode == "rtlsim" and impl_style == "hls": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls")[0] - inst = getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - assert exp_cycles != 0 From 5ae57acfd214b0e96f8b5242b08c33065db0bee4 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 18 Jan 2024 18:32:06 +0000 Subject: [PATCH 042/291] [BTS-Integration] HLS module Placeholder --- src/finn/custom_op/fpgadataflow/__init__.py | 6 +- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../hls/thresholdingbinarysearch_hls.py | 856 ++++++++++++++++++ .../fpgadataflow/thresholdingbinarysearch.py | 115 +++ ...fpgadataflow_thresholding_binary_search.py | 53 +- 5 files changed, 1028 insertions(+), 4 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py create mode 100644 src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 6fffbcc23d..827a8ea8da 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -54,8 +54,8 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch -from finn.custom_op.fpgadataflow.thresholding_binary_search import ( - Thresholding_Binary_Search, +from finn.custom_op.fpgadataflow.thresholdingbinarysearch import ( + ThresholdingBinarySearch, ) from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour @@ -71,7 +71,7 @@ custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Pixel"] = FMPadding_Pixel custom_op["Thresholding_Batch"] = Thresholding_Batch -custom_op["Thresholding_Binary_Search"] = Thresholding_Binary_Search +custom_op["ThresholdingBinarySearch"] = ThresholdingBinarySearch custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index bcf36dad67..36b603102d 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -43,6 +43,7 @@ from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls +from finn.custom_op.fpgadataflow.hls.thresholdingbinarysearch_hls import ThresholdingBinarySearch_hls custom_op = dict() @@ -60,4 +61,5 @@ custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["ThresholdingBinarySearch_hls"] = ThresholdingBinarySearch_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py new file mode 100644 index 0000000000..a782b21800 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py @@ -0,0 +1,856 @@ +# Copyright (c) 2024, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import textwrap +import warnings +from math import ceil, log2 +from finn.custom_op.fpgadataflow.thresholdingbinarysearch import ThresholdingBinarySearch +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +# ONNX i/o tensor shape assumptions for Thresholding: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the threshold tensor, shape (NumChannels, n_thres) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... here can be any shape (representing groups of vectors) + + +class ThresholdingBinarySearch_hls(ThresholdingBinarySearch,HLSBackend): + """Class that corresponds to finn-hls Thresholding_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(ThresholdingBinarySearch.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def calc_tmem(self): + """Calculates and returns TMEM.""" + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return mh // pe + + def infer_node_datatype(self, model): + pass + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required Threshold_Batch attributes do not exist.""") + + return info_messages + + def bram_estimation(self): + """Calculates BRAM cost if resource set to BRAM""" + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + + if style == "block" and tmem > 1: + return int(ceil(A * P / 16)) * int(ceil(tmem / 1024)) + else: + return 0 + + def lut_estimation(self): + """Calculates LUT cost, taking memory resource type into account""" + # TODO add in/out FIFO contributions + style = self.get_nodeattr("ram_style") + P = self.get_nodeattr("PE") + idt = self.get_input_datatype() + A = idt.bitwidth() + tmem = self.calc_tmem() + # cost of comparators + comparator_cost = A * P + # cost of LUTRAM + if style == "distributed" and tmem > 1: + lutram_cost = P * A * int(ceil(tmem / 64)) + else: + lutram_cost = 0 + # total cost + return comparator_cost + lutram_cost + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_weight_datatype(self): + """Returns FINN DataType of thresholds, here called weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("weightDataType", tdt.name) + # Update QONNX DataType of tensor for consistency + model.set_tensor_datatype(self.onnx_node.input[1], tdt) + return DataType[self.get_nodeattr("weightDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if self.get_nodeattr("mem_mode") == "decoupled": + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + else: + return 0 + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + + def get_ap_int_max_w(self): + temp_value = super().get_ap_int_max_w() + weightstream = self.get_weightstream_width() + return max([weightstream, temp_value]) + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + fold = ich // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + # fill in TSrcI + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." + assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights (thresholds) in appropriate + format for this layer. This file can be used for either synthesis or + run-time reconfig of weights. + + Arguments: + + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of {hls_header, decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + + """ + threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + tdt = self.get_weight_datatype() + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + if weight_file_mode == "hls_header": + # save thresholds in thresh.h + thresholds_hls_code = numpy_to_hls_code( + threshold_tensor, tdt, "thresholds", False, True + ) + # write thresholds into thresh.h + f_thresh = open(weight_file_name, "w") + tdt_hls = tdt.get_hls_datatype_str() + # use binary to export bipolar activations + export_odt = self.get_output_datatype() + if self.get_output_datatype() == DataType["BIPOLAR"]: + export_odt = DataType["BINARY"] + odt_hls = export_odt.get_hls_datatype_str() + f_thresh.write( + "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ + = ".format( + self.calc_tmem(), + self.get_nodeattr("PE"), + threshold_tensor.shape[-1], + tdt_hls, + odt_hls, + self.get_nodeattr("ActVal"), + "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), + ) + ) + f_thresh.write(thresholds_hls_code) + f_thresh.close() + elif "decoupled" in weight_file_mode: + # streaming thresholds need to be organized differently + # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) + decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) + # TODO add flips/reversals as needed here + # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) + pe = self.get_nodeattr("PE") + n_thres_steps = self.get_nodeattr("numSteps") + decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) + decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) + decoupled_thres = decoupled_thres.copy() + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( + 1, -1, pe * n_thres_steps + ) + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() + + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, decoupled_thres) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 + # first, pack and ensure padding to 32 bits + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + # split into groups of 8 hex digits (= 32 bits) + words_32b = textwrap.wrap(val, 8) + words_32b.reverse() + for word_32b in words_32b: + f.write(word_32b + "\n") + else: + raise Exception("Decoupled weight export not yet implemented") + else: + raise Exception("Unknown weight_file_mode") + + def generate_params(self, model, path): + code_gen_dir = path + thresholds = model.get_initializer(self.onnx_node.input[1]) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + # save thresholds in thresh.h + weight_filename = "{}/thresh.h".format(code_gen_dir) + self.make_weight_file(thresholds, "hls_header", weight_filename) + elif mem_mode == "decoupled": + # save decoupled weights for cppsim + weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. + weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl) + else: + raise Exception("Unrecognized mem_mode") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for Thresholding_Batch") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + oshape = self.get_normal_output_shape() + assert context[node.output[0]].shape == oshape, """Output shape is not as expected""" + + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] + if self.get_nodeattr("mem_mode") == "const": + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + # TODO check and add whatever missing + def defines(self, var): + numReps = 1 + numInputVectors = list(self.get_nodeattr("numInputVectors")) + total_spatial_size = int(np.prod(numInputVectors)) + + self.code_gen_dict["$DEFINES$"] = [ + """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}\n + #define ImgDim1 {}""".format( + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + numReps, + total_spatial_size, + ) + ] + if self.get_nodeattr("mem_mode") == "decoupled": + self.code_gen_dict["$DEFINES$"].append( + "#define ActVal1 %d" % self.get_nodeattr("ActVal") + ) + self.code_gen_dict["$DEFINES$"].append( + "#define ThresType1 %s" % self.get_weight_datatype().get_hls_datatype_str() + ) + self.code_gen_dict["$DEFINES$"].append( + "#define NumSteps1 %d" % self.get_nodeattr("numSteps") + ) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + tdt = self.get_weight_datatype() + elem_bits = tdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = tdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/thresholds.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, ImgDim1);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + tmpl_args = self.get_template_param_values() + node = self.onnx_node + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, threshs, numReps);""".format( + node.op_type, + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled": + # note that numReps is set to 1 in the invocation below, since + # - for cppsim the repetition comes from the threshold stream reader+input + # - for synth the unit runs continuously anyway (ap_ctrl_none) + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, numReps);""".format( + "Thresholding_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + ) + ] + else: + raise Exception("Unrecognized mem_mode") + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + if self.get_nodeattr("mem_mode") == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif self.get_nodeattr("mem_mode") == "decoupled": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception("Unrecognized mem_mode") + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if self.get_nodeattr("mem_mode") == "const": + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # set resource type + ram_style = self.get_nodeattr("ram_style") + pe = self.get_nodeattr("PE") + ich = self.get_nodeattr("NumChannels") + # if PE less than NumChannels, assign cores according to ram_style; + # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs + if pe < ich: + if ram_style == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + else: + raise Exception( + """Invalid value for attribute ram_style! Is currently set to: {} + has to be set to one of ("block", "distributed")""".format( + ram_style + ) + ) + elif self.get_nodeattr("mem_mode") == "decoupled": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + def code_generation_ipi(self): + cmd = [] + # add streamer if needed + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + node_name = self.onnx_node.name + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + sname = self.hls_sname() + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP + strm_vlnv = "amd.com:finn:memstream:1.0" + strm_inst = node_name + "_wstrm" + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) + ) + cmd.append( + "set_property -dict [list " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " + "CONFIG.RAM_STYLE {%s} " + "] [get_bd_cells /%s/%s]" + % ( + self.calc_tmem(), + self.get_weightstream_width_padded(), + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", + self.get_nodeattr("ram_style"), + node_name, + strm_inst, + ) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " + "[get_bd_intf_pins %s/%s/weights_%s]" + % (node_name, strm_inst, node_name, node_name, sname) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" + % (node_name, rst_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" + % (node_name, clk_name, node_name, strm_inst) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, rst_name, node_name, node_name, rst_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" + % (node_name, clk_name, node_name, node_name, clk_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, din_name, node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, dout_name, node_name, node_name, dout_name) + ) + if runtime_writable: + # expose axi lite interface for writeable weights + axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/%s] " + "[get_bd_intf_pins %s/%s/%s]" + % (node_name, axilite_name, node_name, strm_inst, axilite_name) + ) + # TODO calculate and pass in segment size here + cmd.append("assign_bd_address") + cmd.append("save_bd_design") + elif mem_mode == "const": + # base class impl sufficient for const mode + return super().code_generation_ipi() + else: + raise Exception("Unrecognized mem_mode for Thresholding_Batch") + return cmd + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def get_op_and_param_counts(self): + ret_dict = {} + weight_bits = self.get_weight_datatype().bitwidth() + out_features = self.get_nodeattr("NumChannels") + num_steps = self.get_nodeattr("numSteps") + # thresholds are called weights in this layer + thres_param_type = "param_threshold_%db" % (weight_bits) + thres_count = out_features * num_steps + ret_dict[thres_param_type] = thres_count + return ret_dict + + def ipgen_extra_directives(self): + "Return a list of extra tcl directives for HLS synthesis." + + return ["config_compile -pipeline_style frp"] + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_tmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py b/src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py new file mode 100644 index 0000000000..3d919d3c6e --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py @@ -0,0 +1,115 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class ThresholdingBinarySearch(HWCustomOp): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function. Used only in decoupled mode + "numSteps": ("i", True, 1), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # name of the top module in verilog template. Used by PyVerilator + # and IPI generation + "gen_top_module": ("s", False, ""), + # bias to be applied to outputs of the node + "activation_bias": ("i", False, 0), + # whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # memory depth triggers for threshold storage + "depth_trigger_uram": ("i", False, 0), + "depth_trigger_bram": ("i", False, 0), + # enable uniform thres optimization + # doesn't actually do anything yet, only + # for resource estimations + "uniform_thres": ("i", False, 0, {0, 1}), + # enable deep pipelining for easier timing closure + # setting to 0 may save some FFs but otherwise leave on + "deep_pipeline": ("i", False, 1, {0, 1}), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_padded_odim(): + pass + + def get_exp_cycles(): + pass + + def get_normal_input_shape(): + pass + + def get_normal_output_shape(): + pass + def get_folded_input_shape(): + pass + def get_folded_output_shape(): + pass + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(): + pass + def verify_node(): + pass + def get_input_datatype(): + pass + def get_output_datatype(): + pass + def get_instream_width(): + pass + def get_outstream_width(): + pass + def get_number_output_values(): + pass + + def execute_node(self, context, graph): + pass diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 24b60f5ea5..8e6bf5cbe3 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -45,6 +45,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -86,6 +87,7 @@ def convert_np_array_to_standard_data_layout(data): def make_single_thresholding_binary_search_modelwrapper( + impl_style, thresholds, pe, input_data_type, @@ -106,7 +108,7 @@ def make_single_thresholding_binary_search_modelwrapper( node_inp_list = ["inp", "thresh"] Thresholding_node = helper.make_node( - "Thresholding_Binary_Search", + "ThresholdingBinarySearch", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -119,6 +121,7 @@ def make_single_thresholding_binary_search_modelwrapper( outputDataType=output_data_type.name, activation_bias=activation_bias, numInputVectors=num_input_vecs, + preferred_impl_style=impl_style, ) graph = helper.make_graph( nodes=[Thresholding_node], @@ -285,3 +288,51 @@ def write_thresh_config(sim): rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) y_produced = input_dict["outp"] assert (y_produced == y_expected).all() + + +# Test brief: Test basic transforms are working +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_fpgadataflow_thresholding_binary_search_transform(impl_style): + input_data_type = DataType["INT16"] + act = DataType["INT4"] + fold = -1 + num_input_channels = 16 + + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = act.get_num_possible_values() - 1 + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values( + input_data_type, num_input_channels, num_steps + ) + thresholds = sort_thresholds_increasing(thresholds) + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = act + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate model from input parameters to the test + model = make_single_thresholding_binary_search_modelwrapper( + impl_style, + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(SpecializeLayers()) + # model = model.transform(SetExecMode("rtlsim")) + # model = model.transform(GiveUniqueNodeNames()) + # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + # model = model.transform(HLSSynthIP()) + # model = model.transform(PrepareRTLSim()) + return \ No newline at end of file From f575b06e97261deb5e21b4b5a76fbaa296f6d622 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 19 Jan 2024 15:36:58 +0000 Subject: [PATCH 043/291] [CustomOp] Initial draft of Pooling layer in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 4 +- .../custom_op/fpgadataflow/hls/__init__.py | 4 +- .../{pool_batch.py => hls/pool_hls.py} | 150 +------------ src/finn/custom_op/fpgadataflow/pool.py | 198 ++++++++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 188 ++++++++++++++++- .../fpgadataflow/specialize_layers.py | 33 +-- ...ch.py => test_convert_to_hw_pool_batch.py} | 32 +-- 7 files changed, 433 insertions(+), 176 deletions(-) rename src/finn/custom_op/fpgadataflow/{pool_batch.py => hls/pool_hls.py} (66%) create mode 100644 src/finn/custom_op/fpgadataflow/pool.py rename tests/fpgadataflow/{test_convert_to_hls_pool_batch.py => test_convert_to_hw_pool_batch.py} (88%) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 8254083ef7..cc496ddf2c 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -43,7 +43,7 @@ from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation -from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch +from finn.custom_op.fpgadataflow.pool import Pool from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, ) @@ -65,7 +65,6 @@ custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["TLastMarker"] = TLastMarker custom_op["StreamingFIFO"] = StreamingFIFO -custom_op["Pool_Batch"] = Pool_Batch custom_op["FMPadding_Pixel"] = FMPadding_Pixel custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["VectorVectorActivation"] = VectorVectorActivation @@ -83,6 +82,7 @@ custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect custom_op["Lookup"] = Lookup +custom_op["Pool"] = Pool custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index bcf36dad67..7ae7ffa34d 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,6 +37,7 @@ from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls +from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( StreamingDataWidthConverter_hls, ) @@ -57,6 +58,7 @@ custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls custom_op["LabelSelect_hls"] = LabelSelect_hls custom_op["Lookup_hls"] = Lookup_hls +custom_op["Pool_hls"] = Pool_hls custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py similarity index 66% rename from src/finn/custom_op/fpgadataflow/pool_batch.py rename to src/finn/custom_op/fpgadataflow/hls/pool_hls.py index 8c7bc83141..2baaad01a7 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,11 +30,12 @@ import os from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.pool import Pool from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class Pool_Batch(HLSCustomOp): +class Pool_hls(Pool, HLSBackend): """Class that corresponds to finn-hlslib Pool_batch function. Requires ConvolutionInputGenerator(depthwise == 1) to format its input @@ -54,148 +55,11 @@ class Pool_Batch(HLSCustomOp): """ def get_nodeattr_types(self): - my_attrs = { - "Channels": ("i", True, 0), - "PE": ("i", True, 1), - "KernelSize": ("ints", True, []), - # Function: - # - MaxPool - # - QuantAvgPool - # TODO add support for AvgPool and AccPool - "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), - "OutImgDims": ("ints", True, []), - # FINN DataTypes for inputs/outputs - "InputDataType": ("s", True, ""), - "OutputDataType": ("s", True, ""), - "AccumBits": ("i", False, 0), - "Size": ("i", False, 1), - "BatchSize": ("i", False, 1), - } - - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(Pool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("InputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - fxn = self.get_nodeattr("Function") - odt = DataType[self.get_nodeattr("OutputDataType")] - - if fxn == "MaxPool": - # Same as input - idt = DataType[self.get_nodeattr("InputDataType")] - assert odt == idt, "In datatype must be equal to out datatype for Maxpool" - elif fxn == "QuantAvgPool": - idt = DataType[self.get_nodeattr("InputDataType")] - assert ( - idt.signed() == odt.signed() - ), """QuantAvgPool: Can't mix signed - and unsigned datatypes""" - else: - raise Exception("Pool_Batch doesn't currently support " + fxn) - - return odt - - def get_normal_input_shape(self, ind=0): - ifm_ch = self.get_nodeattr("Channels") - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - k = self.get_nodeattr("KernelSize") - k_prod = int(np.prod(k)) - ishape = (batch_size, *odims, k_prod * ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - assert ifm_ch % pe == 0, "PE must divide input channels" - fold = int(normal_ishape[-1] / pe) - folded_ishape = normal_ishape[:-1] + [fold, pe] - return tuple(folded_ishape) - - def get_normal_output_shape(self, ind=0): - ofm_ch = self.get_nodeattr("Channels") - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - oshape = (batch_size, *odims, ofm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - assert ifm_ch % pe == 0, "PE must divide input channels" - fold = int(ifm_ch / pe) - folded_oshape = normal_oshape[:-1] + [fold, pe] - return tuple(folded_oshape) - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[1:-1]) - - def get_exp_cycles(self): - # (Channels * kernel * kernel) / PE * odim * odim * batch_size - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - k = self.get_nodeattr("KernelSize") - k_prod = int(np.prod(k)) - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size - return int(exp_cycles) - - def get_instream_width(self, ind=0): - dt_bits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = int(dt_bits * pe) - return in_width - - def get_outstream_width(self, ind=0): - dt_bits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = int(dt_bits * pe) - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""Pool_Batch needs 1 data input""") - - # check supported function - fnx = self.get_nodeattr("Function") - if fnx in ["MaxPool", "QuantAvgPool"]: - info_messages.append("Attribute Function contains a supported pool function") - else: - info_messages.append("Attribute Function contains an unsupported pool function") - return info_messages - def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "maxpool.h"'] diff --git a/src/finn/custom_op/fpgadataflow/pool.py b/src/finn/custom_op/fpgadataflow/pool.py new file mode 100644 index 0000000000..6a3962e7dd --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/pool.py @@ -0,0 +1,198 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class Pool(HWCustomOp): + """Abstraction layer for HW implementation of Pool. + Requires ConvolutionInputGenerator(depthwise == 1) to format its input + + Input shape (BatchSize,OutImgDim,OutImgDim,TotalKernelSize*Channels) + Output shape (BatchSize,OutImgDim,OutImgDim,Channels) + + Notes: + + * The input shape was chosen to be compatible with im2col (only true when there + is not folding). + * The actual data layout produced by the hlslib kernels is different + for depthwise ops. + + * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + + Channels can be folded using PE (SIMD from the input perspective) + """ + + def get_nodeattr_types(self): + my_attrs = { + "Channels": ("i", True, 0), + "PE": ("i", True, 1), + "KernelSize": ("ints", True, []), + # Function: + # - MaxPool + # - QuantAvgPool + # TODO add support for AvgPool and AccPool + "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), + "OutImgDims": ("ints", True, []), + # FINN DataTypes for inputs/outputs + "InputDataType": ("s", True, ""), + "OutputDataType": ("s", True, ""), + "AccumBits": ("i", False, 0), + "Size": ("i", False, 1), + "BatchSize": ("i", False, 1), + } + + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("InputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + fxn = self.get_nodeattr("Function") + odt = DataType[self.get_nodeattr("OutputDataType")] + + if fxn == "MaxPool": + # Same as input + idt = DataType[self.get_nodeattr("InputDataType")] + assert odt == idt, "In datatype must be equal to out datatype for Maxpool" + elif fxn == "QuantAvgPool": + idt = DataType[self.get_nodeattr("InputDataType")] + assert ( + idt.signed() == odt.signed() + ), """QuantAvgPool: Can't mix signed + and unsigned datatypes""" + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + return odt + + def get_normal_input_shape(self, ind=0): + ifm_ch = self.get_nodeattr("Channels") + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + k = self.get_nodeattr("KernelSize") + k_prod = int(np.prod(k)) + ishape = (batch_size, *odims, k_prod * ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(normal_ishape[-1] / pe) + folded_ishape = normal_ishape[:-1] + [fold, pe] + return tuple(folded_ishape) + + def get_normal_output_shape(self, ind=0): + ofm_ch = self.get_nodeattr("Channels") + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + oshape = (batch_size, *odims, ofm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(ifm_ch / pe) + folded_oshape = normal_oshape[:-1] + [fold, pe] + return tuple(folded_oshape) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[1:-1]) + + def get_exp_cycles(self): + # (Channels * kernel * kernel) / PE * odim * odim * batch_size + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + k = self.get_nodeattr("KernelSize") + k_prod = int(np.prod(k)) + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size + return int(exp_cycles) + + def get_instream_width(self, ind=0): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = int(dt_bits * pe) + return in_width + + def get_outstream_width(self, ind=0): + dt_bits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = int(dt_bits * pe) + return out_width + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""Pool_Batch needs 1 data input""") + + # check supported function + fnx = self.get_nodeattr("Function") + if fnx in ["MaxPool", "QuantAvgPool"]: + info_messages.append("Attribute Function contains a supported pool function") + else: + info_messages.append("Attribute Function contains an unsupported pool function") + return info_messages + + def execute_node(self, context, graph): + pass diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 28b7dba9cb..0d3350a06d 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -311,7 +311,7 @@ def apply(self, model): pass_1d = is_1d and (not is_bipolar) pass_2d = (not is_1d) and is_divisable if pass_1d or pass_2d: - # create equivalent StreamingMaxPool_Batch node + # create equivalent StreamingMaxPool node new_node = helper.make_node( "StreamingMaxPool", [mp_input], @@ -804,6 +804,192 @@ def apply(self, model): return (model, graph_modified) +class InferPool(Transformation): + """If kernel_shape > strides, replace Pool layer with with of Im2col + + pool(with kernel_shape == strides), plus Transpose layers to keep the original + data layout.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: + node_input = node.input[0] + ishape = model.get_tensor_shape(node_input) + node_output = node.output[0] + idt = model.get_tensor_datatype(node_input) + oshape = model.get_tensor_shape(node_output) + # only support 4D input tensors (1D convs need extra dummy dim) + if len(ishape) != 4: + continue + + # extract pool parameters + if node.op_type == "MaxPool": + kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) + sh, sw = list(get_by_name(node.attribute, "strides").ints) + dlayout = "NCHW" + elif node.op_type == "QuantAvgPool2d": + inst = getCustomOp(node) + # QuantAvgPool2d has a single scalar attribute + # for kernel size and stride (implicit square) + kh = kw = inst.get_nodeattr("kernel") + sh = sw = inst.get_nodeattr("stride") + dlayout = inst.get_nodeattr("data_layout") + elif node.op_type == "MaxPoolNHWC": + inst = getCustomOp(node) + kh, kw = inst.get_nodeattr("kernel_shape") + sh, sw = inst.get_nodeattr("strides") + dlayout = "NHWC" + try: + pad = list(get_by_name(node.attribute, "pads").ints) + except AttributeError: + pad = [0, 0, 0, 0] + + if not idt.is_integer(): + continue + + if (kh < sh) or (kw < sw): + # TODO check/implement swg support + continue + + odt = model.get_tensor_datatype(node_output) + + if dlayout == "NCHW": + _, ifm_ch, ifm_h, ifm_w = ishape + _, ofm_ch, ofm_h, ofm_w = oshape + elif dlayout == "NHWC": + _, ifm_h, ifm_w, ifm_ch = ishape + _, ofm_h, ofm_w, ofm_ch = oshape + else: + raise Exception("Unknown dlayout: " + str(dlayout)) + + # if data layout NCHW, we need transpose nodes surrounding + # the hls layer + if dlayout == "NCHW": + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_h, ifm_w, ifm_ch), # NHWC + ) + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + model.set_tensor_datatype(inp_trans_out, idt) + + pool_output = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_h, ofm_w, ofm_ch), + ) + graph.value_info.append(pool_output) + pool_output = pool_output.name + + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_h, ofm_w, ifm_ch * kh * kw), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) + + # create new nodes + if dlayout == "NCHW": + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + im2col_in = inp_trans_out + else: + im2col_in = node_input + pool_output = node_output + + accum_bits = 0 + pool_size_param = 0 # will be overridden if neededs + pad_value = 0 + if node.op_type in ["MaxPool", "MaxPoolNHWC"]: + pool_fxn = "MaxPool" + odt = idt + pad_value = idt.min() + elif node.op_type == "QuantAvgPool2d": + assert odt.is_integer(), """Output data type for QuantAvgPool2d + needs to be integer""" + assert all(x == 0 for x in pad), "Padding is not supported for QuantAvgPool2d" + inst = getCustomOp(node) + pool_fxn = "QuantAvgPool" + pool_size_param = inst.get_shifts() + accum_bits = inst.get_accum_size() + + else: + raise Exception( + "pad_value and pool_fxn not configured for {}".format(node.op_type) + ) + + # format input tensor + im2col_node = helper.make_node( + "Im2Col", + [im2col_in], + [im2col_out], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[kh, kw], + pad_amount=pad, + pad_value=pad_value, + depthwise=1, + input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), + name="Im2Col_" + node.name, + ) + + # Warning PE has to be equal to ifm_ch until Im2Col is replaced by + # ConvolutionInputGenerator with depthwise=1. + # For other settings the output will be incorrect due to incorrect input + # data layout + pool_node = helper.make_node( + "Pool", + [im2col_out], + [pool_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + InputDataType=idt.name, + OutputDataType=odt.name, + Channels=ifm_ch, + PE=ifm_ch, + KernelSize=[kh, kw], + Function=pool_fxn, + OutImgDims=[ofm_h, ofm_w], + AccumBits=accum_bits, + Size=pool_size_param, + BatchSize=1, + name="Pool_" + node.name, + ) + + if dlayout == "NCHW": + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] + ) + + # insert nodes where the conv is to preserve topological ordering + if dlayout == "NCHW": + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, pool_node) + graph.node.insert(node_ind + 3, out_trans_node) + else: + graph.node.insert(node_ind, im2col_node) + graph.node.insert(node_ind + 1, pool_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferLookupLayer(Transformation): """Convert Gather nodes with constant op0 into Lookup HW layers.""" diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 6c1def628f..31da3756d3 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -166,23 +166,26 @@ def _swg_hls_possible(node): # can only be used for square inputs # and no dilation swg = getCustomOp(node) - # extract all attributes to check - k = swg.get_nodeattr("ConvKernelDim") - ifm_dim = swg.get_nodeattr("IFMDim") - ofm_dim = swg.get_nodeattr("OFMDim") - s = swg.get_nodeattr("Stride") - d = swg.get_nodeattr("Dilation") - # check if square and dilation=1 - if ( - k[0] == k[1] - and ifm_dim[0] == ifm_dim[1] - and ofm_dim[0] == ofm_dim[1] - and s[0] == s[1] - and d[0] == d[1] == 1 - ): + if swg.get_nodeattr("is1D"): return True else: - return False + # extract all attributes to check + k = swg.get_nodeattr("ConvKernelDim") + ifm_dim = swg.get_nodeattr("IFMDim") + ofm_dim = swg.get_nodeattr("OFMDim") + s = swg.get_nodeattr("Stride") + d = swg.get_nodeattr("Dilation") + # check if square and dilation=1 + if ( + k[0] == k[1] + and ifm_dim[0] == ifm_dim[1] + and ofm_dim[0] == ofm_dim[1] + and s[0] == s[1] + and d[0] == d[1] == 1 + ): + return True + else: + return False class SpecializeLayers(Transformation): diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py similarity index 88% rename from tests/fpgadataflow/test_convert_to_hls_pool_batch.py rename to tests/fpgadataflow/test_convert_to_hw_pool_batch.py index 417b4fbae2..442f0a913f 100644 --- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,7 +38,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -46,6 +46,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False): @@ -133,7 +134,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode): +def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode): k, stride, pad, ifm_dim = pool_config if ifm_ch % pe != 0: @@ -156,10 +157,6 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e # prepare input data input_dict = prepare_inputs(x) if op_type == "MaxPool": - # if idt.signed(): - # pytest.skip("""No support for signed input (see accu initialization - # in Pool_batch HLSLIB function). Skipping""") - if idt != odt: pytest.skip("Skipping Maxpool with idt != odt") @@ -178,16 +175,23 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e y_expected = oxe.execute_onnx(model, input_dict)["outp"] - new_model = model.transform(to_hls.InferPool_Batch()) + new_model = model.transform(to_hw.InferPool()) new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(to_hw.InferConvInpGen()) + # to test cppsim, set preferred_impl_style for swg to hls + inst = getCustomOp(new_model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", "hls") + if pad != 0: + inst = getCustomOp(new_model.get_nodes_by_op_type("FMPadding")[0]) + inst.set_nodeattr("preferred_impl_style", "hls") + new_model = new_model.transform(SpecializeLayers()) - new_model = new_model.transform(to_hls.InferConvInpGen()) # Folding for n in new_model.graph.node: if n.op_type.startswith("ConvolutionInputGenerator"): inst = getCustomOp(n) inst.set_nodeattr("SIMD", pe) - elif n.op_type == "Pool_Batch": + elif n.op_type.startswith("Pool"): inst = getCustomOp(n) inst.set_nodeattr("PE", pe) @@ -196,14 +200,14 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e assert len(new_model.graph.node) == 4 assert new_model.graph.node[0].op_type == "Transpose" assert new_model.graph.node[1].op_type.startswith("ConvolutionInputGenerator") - assert new_model.graph.node[2].op_type == "Pool_Batch" + assert new_model.graph.node[2].op_type.startswith("Pool") assert new_model.graph.node[3].op_type == "Transpose" else: assert len(new_model.graph.node) == 5 assert new_model.graph.node[0].op_type == "Transpose" - assert new_model.graph.node[1].op_type == "FMPadding_Batch" + assert new_model.graph.node[1].op_type.startswith("FMPadding") assert new_model.graph.node[2].op_type.startswith("ConvolutionInputGenerator") - assert new_model.graph.node[3].op_type == "Pool_Batch" + assert new_model.graph.node[3].op_type.startswith("Pool") assert new_model.graph.node[4].op_type == "Transpose" else: # not currently converted to HLS, node stays as-is @@ -230,7 +234,7 @@ def test_convert_to_hls_pool_batch(idt, odt, pool_config, ifm_ch, pe, op_type, e assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("Pool_Batch")[0] + node = new_model.get_nodes_by_op_type("Pool_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) From 6519986b2ce7ae574f55d97b4597d6617fab1d03 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 19 Jan 2024 16:19:51 +0000 Subject: [PATCH 044/291] [CustomOp] Initial draft of Concat layer in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 +- src/finn/custom_op/fpgadataflow/concat.py | 257 +-------------- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../custom_op/fpgadataflow/hls/concat_hls.py | 295 ++++++++++++++++++ .../fpgadataflow/convert_to_hw_layers.py | 62 +++- .../fpgadataflow/test_fpgadataflow_concat.py | 16 +- 6 files changed, 381 insertions(+), 253 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/concat_hls.py diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index cc496ddf2c..476489a26e 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -70,7 +70,6 @@ custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition -custom_op["StreamingConcat"] = StreamingConcat custom_op["CheckSum"] = CheckSum custom_op["FMPadding"] = FMPadding @@ -83,6 +82,7 @@ custom_op["LabelSelect"] = LabelSelect custom_op["Lookup"] = Lookup custom_op["Pool"] = Pool +custom_op["StreamingConcat"] = StreamingConcat custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter custom_op["StreamingEltwise"] = StreamingEltwise custom_op["StreamingMaxPool"] = StreamingMaxPool diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py index 8c24dadbeb..210b6b7fdd 100644 --- a/src/finn/custom_op/fpgadataflow/concat.py +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -1,4 +1,5 @@ # Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,16 +28,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class StreamingConcat(HLSCustomOp): - """Streaming concatenation node with dynamically generated HLS. +class StreamingConcat(HWCustomOp): + """Abstraction layer for HW implementation of Concat. Only supports concatenating along the last axis.""" def __init__(self, onnx_node, **kwargs): @@ -127,251 +126,13 @@ def get_number_output_values(self): def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) - def generate_params(self, model, path): - elems_per_stream = self.get_nodeattr("ElemsPerStream") - inp_streams = [] - commands = [] - idt = self.get_input_datatype() - total_elems = self.get_total_elems() - total_bw = idt.bitwidth() * total_elems - for i, elems in enumerate(elems_per_stream): - bw = idt.bitwidth() * elems - inp_stream = "hls::stream > &in%d" % (bw, i) - inp_streams.append(inp_stream) - cmd = "in%d.read()" % i - commands.append(cmd) - out_stream = "hls::stream > &out" % (total_bw) - inp_streams.append(out_stream) - - impl_hls_code = [] - impl_hls_code.append("void StreamingConcat(") - impl_hls_code.append(",".join(inp_streams)) - impl_hls_code.append(", unsigned int numReps) {") - impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {") - impl_hls_code.append("#pragma HLS PIPELINE II=1") - impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw) - # FIXME: the order of streams for concatenation works out differently - # for cppsim vs rtlsim, addressed via reversing the order of commands - # for now - impl_hls_code.append("#ifdef __SYNTHESIS__") - impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");") - impl_hls_code.append("#else") - impl_hls_code.append("out_elem = (" + ",".join(commands) + ");") - impl_hls_code.append("#endif") - impl_hls_code.append("out.write(out_elem);") - impl_hls_code.append("}") - impl_hls_code.append("}") - impl_hls_code = "\n".join(impl_hls_code) - - impl_filename = "{}/concat_impl.hpp".format(path) - f_impl = open(impl_filename, "w") - f_impl.write(impl_hls_code) - f_impl.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") node = self.onnx_node - n_inps = len(self.onnx_node.input) - ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)] - folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)] - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - export_idt = self.get_input_datatype() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - for i in range(n_inps): - inp = context[node.input[i]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i] - # reshape input into folded form - inp = inp.reshape(folded_ishapes[i]) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - io_dict = {"inputs": {}, "outputs": {"out": []}} - for i in range(n_inps): - nbits = self.get_instream_width(i) - rtlsim_inp = npy_to_rtlsim_input( - "%s/input_%d.npy" % (code_gen_dir, i), - export_idt, - nbits, - reverse_inner=True, - ) - io_dict["inputs"]["in%d" % i] = rtlsim_inp - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - self.rtlsim_multi_io(sim, io_dict) - rtlsim_output = io_dict["outputs"]["out"] - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - reverse_inner=True, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"'] - - def defines(self, var): - num_reps = self.get_nodeattr("numInputVectors") - num_reps = np.prod(num_reps) - self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps] - - def read_npy_data(self): - n_inputs = self.get_n_inputs() - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - npy_type = "float" - self.code_gen_dict["$READNPYDATA$"] = [] - idt = self.get_input_datatype() - idt_bw = idt.bitwidth() - elem_hls_type = idt.get_hls_datatype_str() - elem_bits = idt_bw - for i in range(n_inputs): - packed_bits = self.get_instream_width(i) - packed_hls_type = "ap_uint<%d>" % packed_bits - npy_in = "%s/input_%d.npy" % (code_gen_dir, i) - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - i, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - n_inputs = self.get_n_inputs() - for i in range(n_inputs): - packed_bits = self.get_instream_width(i) - packed_hls_type = "ap_uint<%d>" % packed_bits - stream_name = "in%d_%s" % (i, self.hls_sname()) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [] - n_inputs = self.get_n_inputs() - in_streams = [] - for i in range(n_inputs): - in_streams.append("in%d_%s" % (i, self.hls_sname())) - in_stream_names = ",".join(in_streams) - comp_call = "StreamingConcat(%s, out_%s, NumReps);" % ( - in_stream_names, - self.hls_sname(), - ) - self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - n_inputs = self.get_n_inputs() - in_streams = [] - for i in range(n_inputs): - iwidth = self.get_instream_width(i) - in_streams.append("hls::stream> &in%d_%s" % (iwidth, i, self.hls_sname())) - in_streams = ",".join(in_streams) - total_width = self.get_input_datatype().bitwidth() * self.get_total_elems() - out_stream = "hls::stream> &out_%s" % ( - total_width, - self.hls_sname(), - ) - blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream) - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] - - def pragmas(self): - n_inputs = self.get_n_inputs() - pragmas = [] - for i in range(n_inputs): - pragmas.append("#pragma HLS INTERFACE axis port=in%d_%s" % (i, self.hls_sname())) - self.code_gen_dict["$PRAGMAS$"] = pragmas - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + inp_values = [] + for inp in node.input: + inp_values.append(context[inp]) + result = np.concatenate(inp_values, axis=-1) + context[node.output[0]] = result def get_instream_width_padded(self, ind=0): in_width = self.get_instream_width(ind) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 7ae7ffa34d..b4aae1ef3a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -28,6 +28,7 @@ from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import ( ConvolutionInputGenerator_hls, ) @@ -59,6 +60,7 @@ custom_op["LabelSelect_hls"] = LabelSelect_hls custom_op["Lookup_hls"] = Lookup_hls custom_op["Pool_hls"] = Pool_hls +custom_op["StreamingConcat_hls"] = StreamingConcat_hls custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py new file mode 100644 index 0000000000..f608b343f6 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py @@ -0,0 +1,295 @@ +# Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.concat import StreamingConcat +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingConcat_hls(StreamingConcat, HLSBackend): + """Streaming concatenation node with dynamically generated HLS. + Only supports concatenating along the last axis.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingConcat.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def generate_params(self, model, path): + elems_per_stream = self.get_nodeattr("ElemsPerStream") + inp_streams = [] + commands = [] + idt = self.get_input_datatype() + total_elems = self.get_total_elems() + total_bw = idt.bitwidth() * total_elems + for i, elems in enumerate(elems_per_stream): + bw = idt.bitwidth() * elems + inp_stream = "hls::stream > &in%d" % (bw, i) + inp_streams.append(inp_stream) + cmd = "in%d.read()" % i + commands.append(cmd) + out_stream = "hls::stream > &out" % (total_bw) + inp_streams.append(out_stream) + + impl_hls_code = [] + impl_hls_code.append("void StreamingConcat(") + impl_hls_code.append(",".join(inp_streams)) + impl_hls_code.append(", unsigned int numReps) {") + impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {") + impl_hls_code.append("#pragma HLS PIPELINE II=1") + impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw) + # FIXME: the order of streams for concatenation works out differently + # for cppsim vs rtlsim, addressed via reversing the order of commands + # for now + impl_hls_code.append("#ifdef __SYNTHESIS__") + impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");") + impl_hls_code.append("#else") + impl_hls_code.append("out_elem = (" + ",".join(commands) + ");") + impl_hls_code.append("#endif") + impl_hls_code.append("out.write(out_elem);") + impl_hls_code.append("}") + impl_hls_code.append("}") + impl_hls_code = "\n".join(impl_hls_code) + + impl_filename = "{}/concat_impl.hpp".format(path) + f_impl = open(impl_filename, "w") + f_impl.write(impl_hls_code) + f_impl.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + n_inps = len(self.onnx_node.input) + ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)] + folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)] + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + export_idt = self.get_input_datatype() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + for i in range(n_inps): + inp = context[node.input[i]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i] + # reshape input into folded form + inp = inp.reshape(folded_ishapes[i]) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + io_dict = {"inputs": {}, "outputs": {"out": []}} + for i in range(n_inps): + nbits = self.get_instream_width(i) + rtlsim_inp = npy_to_rtlsim_input( + "%s/input_%d.npy" % (code_gen_dir, i), + export_idt, + nbits, + reverse_inner=True, + ) + io_dict["inputs"]["in%d" % i] = rtlsim_inp + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + self.rtlsim_multi_io(sim, io_dict) + rtlsim_output = io_dict["outputs"]["out"] + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"'] + + def defines(self, var): + num_reps = self.get_nodeattr("numInputVectors") + num_reps = np.prod(num_reps) + self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps] + + def read_npy_data(self): + n_inputs = self.get_n_inputs() + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + idt = self.get_input_datatype() + idt_bw = idt.bitwidth() + elem_hls_type = idt.get_hls_datatype_str() + elem_bits = idt_bw + for i in range(n_inputs): + packed_bits = self.get_instream_width(i) + packed_hls_type = "ap_uint<%d>" % packed_bits + npy_in = "%s/input_%d.npy" % (code_gen_dir, i) + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + i, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + n_inputs = self.get_n_inputs() + for i in range(n_inputs): + packed_bits = self.get_instream_width(i) + packed_hls_type = "ap_uint<%d>" % packed_bits + stream_name = "in%d_%s" % (i, self.hls_sname()) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [] + n_inputs = self.get_n_inputs() + in_streams = [] + for i in range(n_inputs): + in_streams.append("in%d_%s" % (i, self.hls_sname())) + in_stream_names = ",".join(in_streams) + comp_call = "StreamingConcat(%s, out_%s, NumReps);" % ( + in_stream_names, + self.hls_sname(), + ) + self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + n_inputs = self.get_n_inputs() + in_streams = [] + for i in range(n_inputs): + iwidth = self.get_instream_width(i) + in_streams.append("hls::stream> &in%d_%s" % (iwidth, i, self.hls_sname())) + in_streams = ",".join(in_streams) + total_width = self.get_input_datatype().bitwidth() * self.get_total_elems() + out_stream = "hls::stream> &out_%s" % ( + total_width, + self.hls_sname(), + ) + blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream) + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] + + def pragmas(self): + n_inputs = self.get_n_inputs() + pragmas = [] + for i in range(n_inputs): + pragmas.append("#pragma HLS INTERFACE axis port=in%d_%s" % (i, self.hls_sname())) + self.code_gen_dict["$PRAGMAS$"] = pragmas + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 0d3350a06d..2b8433e59c 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -866,7 +866,7 @@ def apply(self, model): raise Exception("Unknown dlayout: " + str(dlayout)) # if data layout NCHW, we need transpose nodes surrounding - # the hls layer + # the hw layer if dlayout == "NCHW": # create new intermediate values inp_trans_out = helper.make_tensor_value_info( @@ -1043,6 +1043,66 @@ def apply(self, model): return (model, graph_modified) +class InferConcatLayer(Transformation): + """Convert suitable Concat nodes (operating on last/-1 axis) + into StreamingConcat HW layers.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "Concat": + ishape = model.get_tensor_shape(node.input[0]) + axis = get_by_name(node.attribute, "axis") + if (axis is None) or (ishape is None): + continue + axis = axis.i + last_axis = len(ishape) - 1 + # skip conversion if not using last axis + if (axis != -1) and (axis != last_axis): + continue + # check datatype coherence + dt0 = model.get_tensor_datatype(node.input[0]) + if dt0 is None: + continue + dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input]) + if not dt_coherent: + continue + # skip conversion if any inputs are static + all_static = all([model.get_initializer(x) is None for x in node.input]) + if not all_static: + continue + # skip conversion if inputs are not integers + if not dt0.is_integer(): + continue + # ready for conversion + elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input] + inp_vec = list(model.get_tensor_shape(node.input[0])[:-1]) + new_node = helper.make_node( + "StreamingConcat", + node.input, + node.output, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + name="Concat_" + node.name, + ElemsPerStream=elems_per_stream, + inputDataType=dt0.name, + numInputVectors=inp_vec, + inFIFODepths=[2] * len(node.input), + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + class InferStreamingEltwise(Transformation): """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer with SubEltwise or AbsDiffEltwise op.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index 2b2069a72b..b4d8a04a95 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -1,4 +1,5 @@ # Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,7 +41,7 @@ from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferConcatLayer +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferConcatLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -48,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_concat_model(i_shapes, idt): @@ -90,10 +92,15 @@ def test_fpgadataflow_concat(exec_mode, idt): inp_dict[model.graph.input[i].name] = i_data[i] ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW and verify conversion model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + ret = execute_onnx(model, inp_dict) + assert (ret[oname] == exp_out).all() + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "StreamingConcat_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" if exec_mode == "cppsim": model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) @@ -130,10 +137,13 @@ def test_fpgadataflow_concat_stitchedip(): inp_dict[model.graph.input[i].name] = i_data[i] ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW and verify conversion model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "StreamingConcat_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" model = model.transform(InsertFIFO(create_shallow_fifos=True)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, clk_ns)) From 89bfc756b9029561664e6df34029482d89deca67 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 23 Jan 2024 16:27:07 +0000 Subject: [PATCH 045/291] [CustomOp] Add execution fct to pool hw layer --- src/finn/custom_op/fpgadataflow/pool.py | 28 ++++++++++++++++++- .../test_convert_to_hw_pool_batch.py | 2 ++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/pool.py b/src/finn/custom_op/fpgadataflow/pool.py index 6a3962e7dd..35aee023b9 100644 --- a/src/finn/custom_op/fpgadataflow/pool.py +++ b/src/finn/custom_op/fpgadataflow/pool.py @@ -195,4 +195,30 @@ def verify_node(self): return info_messages def execute_node(self, context, graph): - pass + # simulate behavior with Python functionality + node = self.onnx_node + fnx = self.get_nodeattr("Function") + k = self.get_nodeattr("KernelSize") + ch = self.get_nodeattr("Channels") + k2 = k[0] * k[1] + + inp_values = context[node.input[0]] + ishape = inp_values.shape + # reshape array to apply max or avg function only on kernel + tmp_shape = tuple(list(ishape)[:-1] + [k2, ch]) + tmp_values = inp_values.reshape(tmp_shape) + if fnx == "MaxPool": + result = np.max(tmp_values, axis=3) + elif fnx == "QuantAvgPool": + # determine bits to shift + ibits = self.get_input_datatype().bitwidth() + obits = self.get_output_datatype().bitwidth() + max_value = 2**ibits - 1 + max_value = max_value * k2 + max_bit_width = int(max_value).bit_length() + shift_bits = max_bit_width - obits + shift_bits = shift_bits if shift_bits >= 0 else 0 + result = np.sum(tmp_values, axis=3) + result = np.right_shift(result.astype(int), shift_bits) + oshape = context[node.output[0]].shape + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py index 442f0a913f..d532cf345e 100644 --- a/tests/fpgadataflow/test_convert_to_hw_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -184,6 +184,8 @@ def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mod if pad != 0: inst = getCustomOp(new_model.get_nodes_by_op_type("FMPadding")[0]) inst.set_nodeattr("preferred_impl_style", "hls") + y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] + assert (y_produced == y_expected).all() new_model = new_model.transform(SpecializeLayers()) # Folding From d4c8befda5dc38b2191405df20be6bfe093c46f3 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 24 Jan 2024 11:51:25 +0000 Subject: [PATCH 046/291] [CustomOp] Initial draft of pixel padding layer in new class hierarchy --- .../custom_op/fpgadataflow/fmpadding_pixel.py | 204 ++-------------- .../custom_op/fpgadataflow/hls/__init__.py | 2 + .../fpgadataflow/hls/fmpadding_pixel_hls.py | 228 ++++++++++++++++++ .../fpgadataflow/test_fpgadataflow_deconv.py | 26 +- 4 files changed, 270 insertions(+), 190 deletions(-) create mode 100644 src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py index bc686bc6d2..b1f9900070 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py +++ b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, Advanced Micro Devices, Inc. +# Copyright (c) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,15 +28,13 @@ import numpy as np -import os import warnings from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class FMPadding_Pixel(HLSCustomOp): +class FMPadding_Pixel(HWCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -153,183 +151,25 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - odim_h, odim_w = self.get_padded_odim() - stride_h, stride_w = self.get_nodeattr("Stride") - self.code_gen_dict["$DEFINES$"] = [ - """ - #define OutputDim_x {}\n - #define OutputDim_y {}\n - #define Stride_x {}\n - #define Stride_y {}\n - #define NumChannels {}\n - #define SIMD {}\n - """.format( - odim_w, - odim_h, - stride_w, - stride_h, - self.get_nodeattr("NumChannels"), - self.get_nodeattr("SIMD"), - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - in_t = self.get_input_datatype().get_hls_datatype_str() - odim_h, odim_w = self.get_padded_odim() - stride_h, stride_w = self.get_nodeattr("Stride") - hls_call = "FMPadding_Pixel_Nonsquare" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0, out);""".format( - hls_call, in_t - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # simulate behavior with Python functionality node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) + s_h, s_w = self.get_nodeattr("Stride") + inp_values = context[node.input[0]] + ishape = inp_values.shape + result = np.zeros( + ( + ishape[0], + ishape[1] + (ishape[1] - 1) * (s_h - 1), + ishape[2] + (ishape[2] - 1) * (s_w - 1), + ishape[3], ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" + ) + for b in range(ishape[0]): + for h in range(ishape[1]): + for w in range(ishape[2]): + oh = h * s_h + ow = w * s_w + result[b, oh, ow, :] = inp_values[b, h, w, :] + oshape = context[node.output[0]].shape + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index b4aae1ef3a..38cfd73e97 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -35,6 +35,7 @@ from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls +from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls @@ -56,6 +57,7 @@ custom_op["DownSampler_hls"] = DownSampler_hls custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls custom_op["FMPadding_hls"] = FMPadding_hls +custom_op["FMPadding_Pixel_hls"] = FMPadding_Pixel_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls custom_op["LabelSelect_hls"] = LabelSelect_hls custom_op["Lookup_hls"] = Lookup_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py new file mode 100644 index 0000000000..e1393dc96e --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -0,0 +1,228 @@ +# Copyright (c) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class FMPadding_Pixel_hls(FMPadding_Pixel, HLSBackend): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(FMPadding_Pixel.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + odim_h, odim_w = self.get_padded_odim() + stride_h, stride_w = self.get_nodeattr("Stride") + self.code_gen_dict["$DEFINES$"] = [ + """ + #define OutputDim_x {}\n + #define OutputDim_y {}\n + #define Stride_x {}\n + #define Stride_y {}\n + #define NumChannels {}\n + #define SIMD {}\n + """.format( + odim_w, + odim_h, + stride_w, + stride_h, + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + ) + ] + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' + % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out ("out");'.format(self.get_outstream_width()) + ) + + def docompute(self): + in_t = self.get_input_datatype().get_hls_datatype_str() + odim_h, odim_w = self.get_padded_odim() + stride_h, stride_w = self.get_nodeattr("Stride") + hls_call = "FMPadding_Pixel_Nonsquare" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0, out);""".format( + hls_call, in_t + ) + ] + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" + % (self.onnx_node.name, packed_hls_type, packed_hls_type) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index 6c25be0f85..9c333e6808 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, Advanced Micro Devices, Inc. +# Copyright (c) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,10 +41,7 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import ( - InferConvInpGen, - InferQuantizedMatrixVectorActivation, -) +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferConvInpGen from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import ( InferPixelPaddingDeconv, @@ -53,6 +50,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -162,9 +160,12 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, idim_h, idim_w]) input_dict = {"inp": input_tensor} + y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"] + model = ref_model.transform(InferPixelPaddingDeconv()) model = model.transform(InferConvInpGen(use_rtl_variant=convinpgen_rtl)) - model = model.transform(InferQuantizedMatrixVectorActivation()) + # TODO: uncomment when MV(A)U is in new class hierarchy + # model = model.transform(InferQuantizedMatrixVectorActivation()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) @@ -172,13 +173,21 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, if n.op_type == "ConvolutionInputGenerator" and not convinpgen_rtl: convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", simd) + # to test cppsim, set preferred_impl_style for swg to hls + convinputgen_node.set_nodeattr("preferred_impl_style", "hls") + elif n.op_type == "FMPadding": + pad_node = getCustomOp(n) + pad_node.set_nodeattr("preferred_impl_style", "hls") elif n.op_type == "MatrixVectorActivation": mvau_node = getCustomOp(n) mvau_node.set_nodeattr("PE", pe) mvau_node.set_nodeattr("SIMD", simd) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) expected_oshape = (1, ofm_ch, odim_h, odim_w) - y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"] # cppsim if exec_mode == "cppsim": @@ -188,6 +197,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, # rtlsim else: + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) @@ -198,7 +208,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("FMPadding_Pixel")[0] + node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 7ccc72a9a48d3dff0ea6f2c6d56693fbc3020187 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 24 Jan 2024 15:01:29 +0000 Subject: [PATCH 047/291] [CustomOp] Initial draft of checksum layer in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 6 ++---- src/finn/custom_op/fpgadataflow/hls/__init__.py | 2 ++ .../fpgadataflow/{checksum.py => hls/checksum_hls.py} | 9 ++++++--- src/finn/transformation/fpgadataflow/insert_hook.py | 11 ++++++----- tests/fpgadataflow/test_fpgadataflow_checksum.py | 11 ++++++----- 5 files changed, 22 insertions(+), 17 deletions(-) rename src/finn/custom_op/fpgadataflow/{checksum.py => hls/checksum_hls.py} (97%) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 476489a26e..1dd8a6051f 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -29,7 +29,6 @@ from finn.custom_op.fpgadataflow.addstreams import AddStreams from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp -from finn.custom_op.fpgadataflow.checksum import CheckSum from finn.custom_op.fpgadataflow.concat import StreamingConcat from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, @@ -65,19 +64,18 @@ custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["TLastMarker"] = TLastMarker custom_op["StreamingFIFO"] = StreamingFIFO -custom_op["FMPadding_Pixel"] = FMPadding_Pixel custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition -custom_op["CheckSum"] = CheckSum -custom_op["FMPadding"] = FMPadding custom_op["AddStreams"] = AddStreams custom_op["ChannelwiseOp"] = ChannelwiseOp custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator custom_op["DownSampler"] = DownSampler custom_op["DuplicateStreams"] = DuplicateStreams +custom_op["FMPadding"] = FMPadding +custom_op["FMPadding_Pixel"] = FMPadding_Pixel custom_op["GlobalAccPool"] = GlobalAccPool custom_op["LabelSelect"] = LabelSelect custom_op["Lookup"] = Lookup diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 38cfd73e97..ad778de01b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -28,6 +28,7 @@ from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import ( ConvolutionInputGenerator_hls, @@ -53,6 +54,7 @@ # registered and plug in correctly into the infrastructure custom_op["AddStreams_hls"] = AddStreams_hls custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["CheckSum_hls"] = CheckSum_hls custom_op["ConvolutionInputGenerator_hls"] = ConvolutionInputGenerator_hls custom_op["DownSampler_hls"] = DownSampler_hls custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py similarity index 97% rename from src/finn/custom_op/fpgadataflow/checksum.py rename to src/finn/custom_op/fpgadataflow/hls/checksum_hls.py index 6121c5d97a..23818621b9 100644 --- a/src/finn/custom_op/fpgadataflow/checksum.py +++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,11 +32,12 @@ import warnings from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class CheckSum(HLSCustomOp): +class CheckSum_hls(HWCustomOp, HLSBackend): """Class that corresponds to custom_hls checksum function.""" def __init__(self, onnx_node, **kwargs): @@ -52,7 +54,8 @@ def get_nodeattr_types(self): # folded shape of input/output "folded_shape": ("ints", True, []), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def make_shape_compatible_op(self, model): diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py index 14989efa75..23b60d6812 100644 --- a/src/finn/transformation/fpgadataflow/insert_hook.py +++ b/src/finn/transformation/fpgadataflow/insert_hook.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +38,7 @@ def _is_hook_node(node): - if node.op_type in ["CheckSum"]: + if node.op_type in ["CheckSum_hls"]: return True else: return False @@ -81,7 +82,7 @@ def apply(self, model): if n0_hook in list_supported_hooks: if n0_hook == "checksum": if len(consumers) == 1: - if consumers[0].op_type == "CheckSum": + if consumers[0].op_type == "CheckSum_hls": continue n0_normal_oshape = n0.get_normal_output_shape() n0_folded_oshape = n0.get_folded_output_shape() @@ -99,10 +100,10 @@ def apply(self, model): [1], ) chk_node = oh.make_node( - "CheckSum", + "CheckSum_hls", [output_name], outputs=[chk_otensor.name, chk_result.name], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", words_per_frame=words_per_frame, items_per_word=items_per_word, diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 403bb328ae..5cdd99f1e4 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -151,7 +152,7 @@ def test_fpgadataflow_checksum(): model = model.transform(InferShapes()) assert ( - len(model.get_nodes_by_op_type("CheckSum")) == 2 + len(model.get_nodes_by_op_type("CheckSum_hls")) == 2 ), """Insertion of checksum layers was unsuccessful""" @@ -166,8 +167,8 @@ def test_fpgadataflow_checksum(): model = model.transform(CompileCppSim()) inp = {"global_in": x} y_cppsim = oxe.execute_onnx(model, inp, return_full_exec_context=True) - checksum0_cppsim = y_cppsim["CheckSum_0_out1"] - checksum1_cppsim = y_cppsim["CheckSum_1_out1"] + checksum0_cppsim = y_cppsim["CheckSum_hls_0_out1"] + checksum1_cppsim = y_cppsim["CheckSum_hls_1_out1"] # in this test case scenario the checksums are equal assert checksum0_cppsim == checksum1_cppsim, "CheckSums are not equal" @@ -187,7 +188,7 @@ def test_fpgadataflow_checksum(): def read_checksum_and_drain(sim): chk_addr = 16 drain_addr = 32 - for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) checksums.append(axilite_read(sim, chk_addr, basename=axi_name)) drain.append(axilite_read(sim, drain_addr, basename=axi_name)) @@ -196,7 +197,7 @@ def read_checksum_and_drain(sim): def write_drain(sim): addr = 32 - for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) axilite_write(sim, addr, drain_value, basename=axi_name) From 42951dc11504d73d16acdac0b38acc1c0407d00a Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 24 Jan 2024 15:31:35 +0000 Subject: [PATCH 048/291] [CustomOp] Initial draft of iodma in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 -- src/finn/custom_op/fpgadataflow/hls/__init__.py | 2 ++ .../fpgadataflow/{iodma.py => hls/iodma_hls.py} | 11 +++++++---- .../transformation/fpgadataflow/insert_iodma.py | 16 ++++++++-------- 4 files changed, 17 insertions(+), 14 deletions(-) rename src/finn/custom_op/fpgadataflow/{iodma.py => hls/iodma_hls.py} (98%) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 1dd8a6051f..ec26b9d5c1 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -38,7 +38,6 @@ from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool -from finn.custom_op.fpgadataflow.iodma import IODMA from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.lookup import Lookup from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation @@ -66,7 +65,6 @@ custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["VectorVectorActivation"] = VectorVectorActivation -custom_op["IODMA"] = IODMA custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition custom_op["AddStreams"] = AddStreams diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index ad778de01b..5be16c407a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -38,6 +38,7 @@ from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls +from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls @@ -61,6 +62,7 @@ custom_op["FMPadding_hls"] = FMPadding_hls custom_op["FMPadding_Pixel_hls"] = FMPadding_Pixel_hls custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls +custom_op["IODMA_hls"] = IODMA_hls custom_op["LabelSelect_hls"] = LabelSelect_hls custom_op["Lookup_hls"] = Lookup_hls custom_op["Pool_hls"] = Pool_hls diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py similarity index 98% rename from src/finn/custom_op/fpgadataflow/iodma.py rename to src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index bb3de268a0..917ee3798c 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,8 @@ import warnings from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp # the IODMA inerfaces a memory-mapped AXI interface and an AXI stream # direction "in": pulls data from AXI-MM to AXI stream @@ -72,7 +74,7 @@ # -the folded shape is not defined -class IODMA(HLSCustomOp): +class IODMA(HWCustomOp, HLSBackend): """Class that corresponds to finn-hlslib DMA function(s).""" def __init__(self, onnx_node, **kwargs): @@ -97,7 +99,8 @@ def get_nodeattr_types(self): # name of axi-mm interface "intfName": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def get_normal_input_shape(self, ind=0): diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 90700d5726..93e3226b2a 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -106,7 +106,7 @@ def apply(self, model): graph_in_names = [x.name for x in model.graph.input] for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) - if first_node.op_type == "IODMA": + if first_node.op_type == "IODMA_hls": # IODMA already inserted for this input continue else: @@ -134,7 +134,7 @@ def apply(self, model): # padding problems for i/o DMA first_node.input[0] = first_node_in.name dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [graph_in_name], [first_node_in.name], numInputVectors=in_folded_shape[:-1], @@ -143,7 +143,7 @@ def apply(self, model): intfWidth=intfwidth, streamWidth=padded_instream_width, direction="in", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.insert(0, dma_node) @@ -153,7 +153,7 @@ def apply(self, model): graph_out_names = [x.name for x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if final_node.op_type == "IODMA": + if final_node.op_type == "IODMA_hls": continue else: out_shape = model.get_tensor_shape(graph_out_name) @@ -180,7 +180,7 @@ def apply(self, model): # FIXME: currently always using 8-bit dtypes to work around the # padding problems for i/o DMA dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [final_node_out.name], [graph_out_name], numInputVectors=out_folded_shape[:-1], @@ -189,7 +189,7 @@ def apply(self, model): intfWidth=intfwidth, streamWidth=padded_outstream_width, direction="out", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.append(dma_node) @@ -230,7 +230,7 @@ def apply(self, model): model.set_tensor_datatype(fc_node_in.name, w_dtype) model.set_initializer(fc_node_in.name, W) dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [fc_w_name], [fc_node_in.name], numInputVectors=[iodma_mem.shape[0]], @@ -240,7 +240,7 @@ def apply(self, model): streamWidth=streamWidth, direction="in", burstMode="wrap", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) fc_node.input[1] = fc_node_in.name From 5a4673269abb822c408d039752c60cf799ce9196 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 24 Jan 2024 15:47:33 +0000 Subject: [PATCH 049/291] [CustomOp] Initial draft of tlastmarker in new class hierarchy --- src/finn/custom_op/fpgadataflow/__init__.py | 2 -- src/finn/custom_op/fpgadataflow/hls/__init__.py | 2 ++ .../{tlastmarker.py => hls/tlastmarker_hls.py} | 11 +++++++---- src/finn/transformation/fpgadataflow/floorplan.py | 2 +- .../transformation/fpgadataflow/insert_fifo.py | 2 +- .../fpgadataflow/insert_tlastmarker.py | 14 +++++++------- tests/fpgadataflow/test_fpgadataflow_ipstitch.py | 2 +- 7 files changed, 19 insertions(+), 16 deletions(-) rename src/finn/custom_op/fpgadataflow/{tlastmarker.py => hls/tlastmarker_hls.py} (96%) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index ec26b9d5c1..7ae76f4894 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -52,7 +52,6 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch -from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation @@ -61,7 +60,6 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure custom_op["MatrixVectorActivation"] = MatrixVectorActivation -custom_op["TLastMarker"] = TLastMarker custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["VectorVectorActivation"] = VectorVectorActivation diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 5be16c407a..3e31c9785e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -47,6 +47,7 @@ ) from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls +from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls custom_op = dict() @@ -70,4 +71,5 @@ custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py similarity index 96% rename from src/finn/custom_op/fpgadataflow/tlastmarker.py rename to src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py index 9309841b2e..c2ed06f832 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,10 +27,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class TLastMarker(HLSCustomOp): +class TLastMarker_hls(HWCustomOp, HLSBackend): """Node that adds/removes AXI stream TLAST signals where needed. Its behavior is transparent in node-by-node execution, only visible in IP-stitched rtlsim or actual hardware. @@ -56,7 +58,8 @@ def get_nodeattr_types(self): # Vitis docs recommend using qdma_axis for external, ap_axiu for internal "Protocol": ("s", False, "external", {"external", "internal"}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def execute_node(self, context, graph): diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index fce2c2264c..b6de086506 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -123,7 +123,7 @@ def apply(self, model): non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes)) dyn_tlastmarker_nodes = list( filter( - lambda x: x.op_type == "TLastMarker" + lambda x: x.op_type == "TLastMarker_hls" and getCustomOp(x).get_nodeattr("DynIters") == "true", non_dma_nodes, ) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index f57c9e41b7..8debf6f501 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -236,7 +236,7 @@ def apply(self, model): final_node = model.find_producer(graph_out_name) if final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA": assert ( - final_node.op_type != "TLastMarker" + final_node.op_type != "TLastMarker_hls" ), """Insert tlast marker should be done after inserting the FIFOs""" n0 = getCustomOp(final_node) diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 94f0b0eae1..00e5457b52 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -35,7 +35,7 @@ class InsertTLastMarker(Transformation): - """Ensure that the graph is started/terminated with a TLastMarker node, inserting + """Ensure that the graph is started/terminated with a TLastMarker_hls node, inserting one if necessary. Use constructor args to determine type of TLastMarker to be inserted. More information available on the TLastMarker documentation. @@ -52,7 +52,7 @@ def apply(self, model): graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) graph_modified = False - if final_node.op_type != "TLastMarker" and not ( + if final_node.op_type != "TLastMarker_hls" and not ( final_node.op_type == "IODMA" and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") == "out" ): @@ -71,7 +71,7 @@ def apply(self, model): # reroute final node output to final_node_out_name final_node.output[0] = final_node_out.name tlast_node = oh.make_node( - "TLastMarker", + "TLastMarker_hls", [final_node_out.name], [graph_out_name], NumIters=num_iters, @@ -80,7 +80,7 @@ def apply(self, model): DynIters=(1 if self.dyniters else 0), Direction="out", Protocol=("external" if self.external else "internal"), - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.append(tlast_node) @@ -109,7 +109,7 @@ def apply(self, model): ): continue # 2. node is either a TLastMarker or an input IODMA - if first_node.op_type != "TLastMarker" and not ( + if first_node.op_type != "TLastMarker_hls" and not ( first_node.op_type == "IODMA" and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") == "in" ): @@ -141,7 +141,7 @@ def apply(self, model): # reroute final node output to first_node_in_name first_node.input[inp_idx] = first_node_in.name tlast_node = oh.make_node( - "TLastMarker", + "TLastMarker_hls", [graph_in_name], [first_node_in.name], NumIters=num_iters, @@ -150,7 +150,7 @@ def apply(self, model): DynIters=(1 if self.dyniters else 0), Direction="in", Protocol=("external" if self.external else "internal"), - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.insert(insert_idx, tlast_node) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 2d85cc98f4..aedb151af9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -209,7 +209,7 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) assert model.graph.node[0].op_type == "MatrixVectorActivation" - assert model.graph.node[-1].op_type == "TLastMarker" + assert model.graph.node[-1].op_type == "TLastMarker_hls" model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) From 161d43feb0f9c414505920e751365f79a17c7381 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 24 Jan 2024 15:52:18 +0000 Subject: [PATCH 050/291] [CustomOp] Rename IODMA to IODMA_hls --- src/finn/custom_op/fpgadataflow/hls/iodma_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index 917ee3798c..a0701b8989 100644 --- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -74,7 +74,7 @@ # -the folded shape is not defined -class IODMA(HWCustomOp, HLSBackend): +class IODMA_hls(HWCustomOp, HLSBackend): """Class that corresponds to finn-hlslib DMA function(s).""" def __init__(self, onnx_node, **kwargs): From 02c04533fba5421ca68874de9c5cce07b61314d4 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 24 Jan 2024 16:49:00 +0000 Subject: [PATCH 051/291] [Transform] Rename IODMA optype in transformations --- .../fpgadataflow/create_dataflow_partition.py | 2 +- .../transformation/fpgadataflow/create_stitched_ip.py | 2 +- .../transformation/fpgadataflow/externalize_params.py | 2 +- src/finn/transformation/fpgadataflow/floorplan.py | 2 +- src/finn/transformation/fpgadataflow/insert_dwc.py | 2 +- src/finn/transformation/fpgadataflow/insert_fifo.py | 4 ++-- .../transformation/fpgadataflow/insert_tlastmarker.py | 4 ++-- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 8 +++++--- 8 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index 07d6961be3..f34c6b90af 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -52,7 +52,7 @@ def __init__(self, partition_model_dir=None): def apply(self, model): def filter_fc_extw(x): - if x.op_type == "IODMA": + if x.op_type == "IODMA_hls": burst_mode = get_by_name(x.attribute, "burstMode") if burst_mode is not None: burst_mode = burst_mode.s.decode("UTF-8") diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 9a653fe404..1a182c7f4f 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -284,7 +284,7 @@ def apply(self, model): ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") if self.signature: ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") - if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: + if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA_hls"]: warnings.warn( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py index 633db0c553..5e21d8cb2a 100644 --- a/src/finn/transformation/fpgadataflow/externalize_params.py +++ b/src/finn/transformation/fpgadataflow/externalize_params.py @@ -42,7 +42,7 @@ def apply(self, model): graph_modified = False def filter_fc_extw(x): - if x.op_type == "IODMA": + if x.op_type == "IODMA_hls": burst_mode = get_by_name(x.attribute, "burstMode") if burst_mode is not None: burst_mode = burst_mode.s.decode("UTF-8") diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index b6de086506..ceb2bdb5c9 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -119,7 +119,7 @@ def apply(self, model): df_nodes = list( filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes) ) - dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes)) + dma_nodes = list(filter(lambda x: x.op_type == "IODMA_hls", df_nodes)) non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes)) dyn_tlastmarker_nodes = list( filter( diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index ee4311a5a1..81cee8dae4 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -47,7 +47,7 @@ def _suitable_node(node): if _is_dwc_node(node): # no DWC for DWCs return False - elif node.op_type == "IODMA": + elif node.op_type == "IODMA_hls": # IODMA data shapes/widths need special handling return False else: diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 8debf6f501..de555d4101 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -182,7 +182,7 @@ def apply(self, model): for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) # insert FIFO as first node, except when first node is DMA - if first_node.op_type != "StreamingFIFO" and first_node.op_type != "IODMA": + if first_node.op_type != "StreamingFIFO" and first_node.op_type != "IODMA_hls": inp_ind = list(first_node.input).index(graph_in_name) n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) @@ -234,7 +234,7 @@ def apply(self, model): graph_out_names = [x.name for x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA": + if final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA_hls": assert ( final_node.op_type != "TLastMarker_hls" ), """Insert tlast marker should be done diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 00e5457b52..157df46d71 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -53,7 +53,7 @@ def apply(self, model): final_node = model.find_producer(graph_out_name) graph_modified = False if final_node.op_type != "TLastMarker_hls" and not ( - final_node.op_type == "IODMA" + final_node.op_type == "IODMA_hls" and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") == "out" ): custom_op = getCustomOp(final_node) @@ -110,7 +110,7 @@ def apply(self, model): continue # 2. node is either a TLastMarker or an input IODMA if first_node.op_type != "TLastMarker_hls" and not ( - first_node.op_type == "IODMA" + first_node.op_type == "IODMA_hls" and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") == "in" ): custom_op = getCustomOp(first_node) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 6d1fa290b4..d5c2d8f2b5 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -146,7 +146,7 @@ def apply(self, model): Ensure CreateDataflowPartition called before driver creation.""" first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model")) assert ( - first_df_model.graph.node[0].op_type == "IODMA" + first_df_model.graph.node[0].op_type == "IODMA_hls" ), "First partition must hold input IODMA" successors = model.find_direct_successors(i_consumer) successor_input_num = list(successors[0].input).index(i_consumer.output[0]) @@ -187,7 +187,9 @@ def apply(self, model): ), """ Ensure CreateDataflowPartition called before driver creation.""" df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model")) - assert df_model.graph.node[-1].op_type == "IODMA", "Partition must hold output IODMA" + assert ( + df_model.graph.node[-1].op_type == "IODMA_hls" + ), "Partition must hold output IODMA" predecessors = model.find_direct_predecessors(o_producer) predecessor_output_num = list(predecessors[0].output).index(o_producer.input[0]) predecessor_sdp = getCustomOp(predecessors[0]) @@ -231,7 +233,7 @@ def apply(self, model): sdp_inst = getCustomOp(node) idma_name = sdp_inst.get_nodeattr("instance_name") df_model = ModelWrapper(sdp_inst.get_nodeattr("model")) - assert df_model.graph.node[0].op_type == "IODMA" + assert df_model.graph.node[0].op_type == "IODMA_hls" iodma_node = getCustomOp(df_model.graph.node[0]) if iodma_node.get_nodeattr("burstMode") == "wrap": # input weights dma? init_tensor = df_model.get_initializer(iodma_node.onnx_node.input[0]) From d7104881e83a4b56502264925f32f0b3fbffc801 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 25 Jan 2024 15:49:44 +0000 Subject: [PATCH 052/291] [CustomOp] Initial draft of FIFO in new class hierarchy --- finn-rtllib/fifo/hdl/Q_srl.v | 308 ++++++++++++++++++ finn-rtllib/fifo/hdl/fifo_template.v | 72 ++++ .../custom_op/fpgadataflow/rtl/__init__.py | 2 + .../fpgadataflow/rtl/streamingfifo_rtl.py | 283 ++++++++++++++++ .../custom_op/fpgadataflow/streamingfifo.py | 263 +-------------- src/finn/custom_op/fpgadataflow/templates.py | 46 --- tests/fpgadataflow/test_fpgadataflow_fifo.py | 4 +- 7 files changed, 672 insertions(+), 306 deletions(-) create mode 100644 finn-rtllib/fifo/hdl/Q_srl.v create mode 100644 finn-rtllib/fifo/hdl/fifo_template.v create mode 100644 src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v new file mode 100644 index 0000000000..11cef604e0 --- /dev/null +++ b/finn-rtllib/fifo/hdl/Q_srl.v @@ -0,0 +1,308 @@ +// original source: +// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v + + +// Copyright (c) 1999 The Regents of the University of California +// Copyright (c) 2010 The Regents of the University of Pennsylvania +// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London +// Copyright (c) 2020 Xilinx +// +// Permission to use, copy, modify, and distribute this software and +// its documentation for any purpose, without fee, and without a +// written agreement is hereby granted, provided that the above copyright +// notice and this paragraph and the following two paragraphs appear in +// all copies. +// +// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON +// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +// + +// Q_srl_oreg3_prefull_SIMPLE.v +// +// - In-page queue with parameterizable depth, bit width +// - Stream I/O is triple (data, valid, back-pressure), +// with EOS concatenated into the data +// - Flow control for input & output is combinationally decoupled +// - 2 <= depth <= 256 +// * (depth >= 2) is required to decouple I/O flow control, +// where empty => no produce, full => no consume, +// and depth 1 would ping-pong between the two at half rate +// * (depth <= 256) can be modified +// by changing ''synthesis loop_limit X'' below +// and changing ''addrwidth'' or its log computation +// - 1 <= width +// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice, +// plus output register (for fast output) +// - Queue addressing is done by ''addr'' up-down counter +// - Queue fullness is checked by comparator (addr==depth) +// - Queue fullness is pre-computed for next cycle +// - Queue input back-pressure is pre-computed for next cycle +// - Queue output valid (state!=state__empty) is pre-computed for next cycle +// (necessary since SRL data output reg requires non-boolean state) +// - FSM has 3 states (empty, one, more) +// - When empty, continue to emit most recently emitted value (for debugging) +// +// - Queue slots used = / (state==state_empty) ? 0 +// | (state==state_one) ? 1 +// \ (state==state_more) ? addr+2 +// - Queue slots used <= depth +// - Queue slots remaining = depth - used +// = / (state==state_empty) ? depth +// | (state==state_one) ? depth-1 +// \ (state==state_more) ? depth-2-addr +// +// - Synplify 7.1 / 8.0 +// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05 + + +`ifdef Q_srl +`else +`define Q_srl + + +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); + + parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) + parameter width = 16; // - width of data (i_d, o_d) + + parameter addrwidth = $clog2(depth); + + input clock; + input reset; + + input [width-1:0] i_d; // - input stream data (concat data + eos) + input i_v; // - input stream valid + output i_r; // - input stream ready + wire i_b; // - input stream back-pressure + + output [width-1:0] o_d; // - output stream data (concat data + eos) + output o_v; // - output stream valid + input o_r; // - output stream ready + wire o_b; // - output stream back-pressure + + output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now + reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address + // for data output + reg shift_en_; // - SRL16 shift enable + reg [width-1:0] srl [depth-2:0]; // - SRL16 memory + reg shift_en_o_; // - SRLO shift enable + reg [width-1:0] srlo_, srlo // - SRLO output reg + /* synthesis syn_allow_retiming=0 */ ; + + parameter state_empty = 2'd0; // - state empty : o_v=0 o_d=UNDEFINED + parameter state_one = 2'd1; // - state one : o_v=1 o_d=srlo + parameter state_more = 2'd2; // - state more : o_v=1 o_d=srlo + // #items in srl = addr+2 + + reg [1:0] state, state_; // - state register + + wire addr_full_; // - true iff addr==depth-2 on NEXT cycle + reg addr_full; // - true iff addr==depth-2 + wire addr_zero_; // - true iff addr==0 + wire o_v_reg_; // - true iff state_empty on NEXT cycle + reg o_v_reg // - true iff state_empty + /* synthesis syn_allow_retiming=0 */ ; + wire i_b_reg_; // - true iff !full on NEXT cycle + reg i_b_reg // - true iff !full + /* synthesis syn_allow_retiming=0 */ ; + + assign addr_full_ = (state_==state_more) && (addr_==depth-2); + // - queue full + assign addr_zero_ = (addr==0); // - queue contains 2 (or 1,0) + assign o_v_reg_ = (state_!=state_empty); // - output valid if non-empty + assign i_b_reg_ = addr_full_; // - input bp if full + assign o_d = srlo; // - output data from queue + assign o_v = o_v_reg; // - output valid if non-empty + assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; + + assign i_r = !i_b; + assign o_b = !o_r; + + assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0)); + + // - ''always'' block with both FFs and SRL16 does not work, + // since FFs need reset but SRL16 does not + + always @(posedge clock) begin // - seq always: FFs + if (reset) begin + state <= state_empty; + addr <= 0; + addr_full <= 0; + o_v_reg <= 0; + + i_b_reg <= 0; + maxcount_reg <= 0; + + end + else begin + state <= state_; + addr <= addr_; + addr_full <= addr_full_; + o_v_reg <= o_v_reg_; + i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg); + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srlo + // - infer enabled output reg at end of shift chain + // - input first element from i_d, all subsequent elements from SRL16 + if (reset) begin + srlo <= 0; + end + else begin + if (shift_en_o_) begin + srlo <= srlo_; + end + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srl + // - infer enabled SRL16E from shifting srl array + // - no reset capability; srl[] contents undefined on reset + if (shift_en_) begin + // synthesis loop_limit 256 + for (a_=depth-2; a_>0; a_=a_-1) begin + srl[a_] = srl[a_-1]; + end + srl[0] <= i_d; + end + end // always @ (posedge clock or negedge reset) + + always @* begin // - combi always + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + case (state) + + state_empty: begin // - (empty, will not produce) + if (i_v) begin // - empty & i_v => consume + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else begin // - empty & !i_v => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end + + state_one: begin // - (contains one) + if (i_v && o_b) begin // - one & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && o_b) begin // - one & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end // case: state_one + + state_more: begin // - (contains more than one) + if (addr_full || (depth==2)) begin + // - (full, will not consume) + // - (full here if depth==2) + if (o_b) begin // - full & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else begin // - full & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; +// addr_ <= addr-1; +// state_ <= state_more; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? state_one : state_more; + end + end + else begin // - (mid: neither empty nor full) + if (i_v && o_b) begin // - mid & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= addr+1; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? state_one : state_more; + end + end // else: !if(addr_full) + end // case: state_more + + default: begin + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + end // case: default + + endcase // case(state) + end // always @ * + +endmodule // Q_srl + + +`endif // `ifdef Q_srl diff --git a/finn-rtllib/fifo/hdl/fifo_template.v b/finn-rtllib/fifo/hdl/fifo_template.v new file mode 100644 index 0000000000..4c614c83dd --- /dev/null +++ b/finn-rtllib/fifo/hdl/fifo_template.v @@ -0,0 +1,72 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +module $TOP_MODULE_NAME$( +//- Global Control ------------------ +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) +input ap_rst_n, + + +output $COUNT_RANGE$ count, +output $COUNT_RANGE$ maxcount, + +//- AXI Stream - Input -------------- +output in0_V_TREADY, +input in0_V_TVALID, +input $IN_RANGE$ in0_V_TDATA, + +//- AXI Stream - Output -------------- +input out_V_TREADY, +output out_V_TVALID, +output $OUT_RANGE$ out_V_TDATA +); + +Q_srl #( +.depth($DEPTH$), +.width($WIDTH$) +) +$TOP_MODULE_NAME$_impl +( + .clock(ap_clk), + .reset(!ap_rst_n), + .count(count), + .maxcount(maxcount), + .i_d(in0_V_TDATA), + .i_v(in0_V_TVALID), + .i_r(in0_V_TREADY), + .o_d(out_V_TDATA), + .o_v(out_V_TVALID), + .o_r(out_V_TREADY) +); + +endmodule diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index ac75371381..914c033584 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( StreamingDataWidthConverter_rtl, ) +from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl custom_op = dict() @@ -41,3 +42,4 @@ custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl +custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py new file mode 100644 index 0000000000..a9d9e689eb --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -0,0 +1,283 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np +import os +import shutil +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class StreamingFIFO_rtl(StreamingFIFO, RTLBackend): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Toggle between rtl or IPI implementation + # rtl - use the rtl generated IP during stitching + # vivado - use the AXI Infrastructure FIFO + "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), + } + my_attrs.update(StreamingFIFO.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + + return my_attrs + + def get_adjusted_depth(self): + impl = self.get_nodeattr("impl_style") + depth = self.get_nodeattr("depth") + if impl == "vivado": + old_depth = depth + # round up depth to nearest power-of-2 + # Vivado FIFO impl may fail otherwise + depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth + if old_depth != depth: + warnings.warn( + "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" + % (self.onnx_node.name, old_depth, depth) + ) + + return depth + + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + + def generate_hdl(self): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fifo/hdl" + template_path = rtlsrc + "/fifo_template.v" + + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + topname = self.get_verilog_top_module_name() + self.set_nodeattr("gen_top_module", topname) + + code_gen_dict = {} + code_gen_dict["$TOP_MODULE_NAME$"] = topname + # make instream width a multiple of 8 for axi interface + in_width = self.get_instream_width_padded() + count_width = int(self.get_nodeattr("depth") - 1).bit_length() + code_gen_dict["$COUNT_RANGE$"] = "[{}:0]".format(count_width - 1) + code_gen_dict["$IN_RANGE$"] = "[{}:0]".format(in_width - 1) + code_gen_dict["$OUT_RANGE$"] = "[{}:0]".format(in_width - 1) + code_gen_dict["$WIDTH$"] = str(in_width) + code_gen_dict["$DEPTH$"] = str(self.get_nodeattr("depth")) + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "%s" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + shutil.copy(rtlsrc + "/Q_srl.v", code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + inp = context[node.input[0]] + exp_shape = self.get_normal_input_shape() + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file for the input of the node + assert ( + str(inp.dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = inp.reshape(expected_inp_shape) + if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = DataType[self.get_nodeattr("dataType")] + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = DataType[self.get_nodeattr("dataType")] + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def code_generation_ipi(self): + impl_style = self.get_nodeattr("impl_style") + if impl_style == "rtl": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "Q_srl.v", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd + elif impl_style == "vivado": + cmd = [] + node_name = self.onnx_node.name + depth = self.get_adjusted_depth() + ram_style = self.get_nodeattr("ram_style") + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate and configure DWC + cmd.append( + "create_bd_cell -type ip " + "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name + ) + cmd.append( + "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] " + "[get_bd_cells /%s/fifo]" % (depth, node_name) + ) + cmd.append( + "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] " + "[get_bd_cells /%s/fifo]" % (ram_style, node_name) + ) + cmd.append( + "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " + "[get_bd_cells /%s/fifo]" % (np.ceil(self.get_outstream_width() / 8), node_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] " + "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] " + "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] " + "[get_bd_pins %s/fifo/s_axis_aresetn]" % (node_name, rst_name, node_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] " + "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name) + ) + return cmd + else: + raise Exception( + "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style + ) + + def prepare_rtlsim(self): + assert self.get_nodeattr("impl_style") != "vivado", ( + "StreamingFIFO impl_style " + "cannot be vivado for rtlsim. Only impl_style=rtl supported." + ) + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "Q_srl.v", + self.get_nodeattr("gen_top_module") + ".v", + ] + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 1249bc1251..950574ba0a 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,23 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import math import numpy as np -import os -import subprocess import warnings from qonnx.core.datatype import DataType -from shutil import copy -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_finn_root -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -from . import templates - -class StreamingFIFO(HLSCustomOp): +class StreamingFIFO(HWCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.strm_fifo_wrapper = templates.strm_fifo_wrapper def get_nodeattr_types(self): my_attrs = super().get_nodeattr_types() @@ -55,10 +47,6 @@ def get_nodeattr_types(self): "folded_shape": ("ints", True, []), # FINN DataTypes for inputs/outputs "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # rtl - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure FIFO - "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), # FPGA resource type for FIFOs when impl_style is vivado # auto -- let Vivado decide # block -- use BRAM @@ -80,22 +68,6 @@ def get_nodeattr_types(self): return my_attrs - def get_adjusted_depth(self): - impl = self.get_nodeattr("impl_style") - depth = self.get_nodeattr("depth") - if impl == "vivado": - old_depth = depth - # round up depth to nearest power-of-2 - # Vivado FIFO impl may fail otherwise - depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth - if old_depth != depth: - warnings.warn( - "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" - % (self.onnx_node.name, old_depth, depth) - ) - - return depth - def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() @@ -128,85 +100,6 @@ def get_verilog_top_module_intf_names(self): ret["ap_none"] = ["maxcount"] return ret - def get_verilog_top_module_name(self): - "Return the Verilog top module name for this node." - - node = self.onnx_node - prefixed_top_name = "%s" % (node.name) - return prefixed_top_name - - def code_generation_ipgen(self, model, fpgapart, clk): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format(code_gen_dir, self.onnx_node.name) - os.makedirs(verilog_dir) - # copy Q_srl.v from finn-rtllib to verilog directory - memstream_dir = get_finn_root() + "/finn-rtllib/memstream/hdl/" - Q_file = os.path.join(memstream_dir, "Q_srl.v") - copy(Q_file, verilog_dir) - - # empty code gen dictionary for new entries - self.code_gen_dict.clear() - self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)] - self.code_gen_dict["$LAYER_NAME$"] = [ - "{}_{}".format(self.onnx_node.name, self.onnx_node.name) - ] - # make instream width a multiple of 8 for axi interface - in_width = self.get_instream_width_padded() - count_width = int(self.get_nodeattr("depth") - 1).bit_length() - self.code_gen_dict["$COUNT_RANGE$"] = ["[{}:0]".format(count_width - 1)] - self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)] - self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)] - self.code_gen_dict["$WIDTH$"] = [str(in_width)] - self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))] - self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] - - template = self.strm_fifo_wrapper - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - f = open(os.path.join(verilog_dir, "{}.v".format(self.onnx_node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def ipgen_singlenode_code(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format(code_gen_dir, self.onnx_node.name) - # prepare the IP packaging tcl template - template = templates.ip_package_tcl - self.code_gen_dict.clear() - self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)] - # note: setting the root dir as absolute can cause path problems - # the ipgen script will be invoked from the sources dir so root_dir=. is OK - self.code_gen_dict["$VERILOG_DIR$"] = ["."] - self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - f = open(os.path.join(verilog_dir, "package_ip.tcl"), "w") - f.write(template) - f.close() - # create a shell script and call Vivado to invoke the IP pkg script - make_project_sh = verilog_dir + "/make_ip.sh" - working_dir = os.environ["PWD"] - with open(make_project_sh, "w") as f: - f.write("#!/bin/bash \n") - f.write("cd {}\n".format(verilog_dir)) - f.write("vivado -mode batch -source package_ip.tcl\n") - f.write("cd {}\n".format(working_dir)) - bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - # set ipgen_path and ip_path to point to the new packaged IP - self.set_nodeattr("ipgen_path", verilog_dir) - self.set_nodeattr("ip_path", verilog_dir) - vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name) - self.set_nodeattr("ip_vlnv", vlnv) - self.code_gen_dict.clear() - def get_normal_input_shape(self, ind=0): depth = self.get_adjusted_depth() assert depth >= 2, """Depth is too low""" @@ -262,154 +155,13 @@ def get_output_datatype(self, ind=0): return DataType[self.get_nodeattr("dataType")] def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") node = self.onnx_node - inp = context[node.input[0]] - exp_shape = self.get_normal_input_shape() - - if mode == "cppsim": - output = inp - output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) - context[node.output[0]] = output - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # create a npy file for the input of the node - assert ( - str(inp.dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = inp.reshape(expected_inp_shape) - if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = DataType[self.get_nodeattr("dataType")] - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) - odt = DataType[self.get_nodeattr("dataType")] - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) + context[node.output[0]] = context[node.input[0]] def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass - - def code_generation_ipi(self): - impl_style = self.get_nodeattr("impl_style") - if impl_style == "rtl": - return super().code_generation_ipi() - elif impl_style == "vivado": - cmd = [] - node_name = self.onnx_node.name - depth = self.get_adjusted_depth() - ram_style = self.get_nodeattr("ram_style") - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate and configure DWC - cmd.append( - "create_bd_cell -type ip " - "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name - ) - cmd.append( - "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] " - "[get_bd_cells /%s/fifo]" % (depth, node_name) - ) - cmd.append( - "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] " - "[get_bd_cells /%s/fifo]" % (ram_style, node_name) - ) - cmd.append( - "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " - "[get_bd_cells /%s/fifo]" % (np.ceil(self.get_outstream_width() / 8), node_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aresetn]" % (node_name, rst_name, node_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name) - ) - return cmd - else: - raise Exception( - "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style - ) - def bram_estimation(self): """Calculates resource estimation for BRAM""" impl = self.get_nodeattr("impl_style") @@ -473,10 +225,3 @@ def lut_estimation(self): ram_luts = 0 return int(address_luts + ram_luts) - - def prepare_rtlsim(self): - assert self.get_nodeattr("impl_style") != "vivado", ( - "StreamingFIFO impl_style " - "cannot be vivado for rtlsim. Only impl_style=rtl supported." - ) - super().prepare_rtlsim() diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 4e03e6daf9..3d89a0ab23 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -212,49 +212,3 @@ ipx::save_core [ipx::current_core] ipx::archive_core $Top.zip [ipx::current_core] """ - -strm_fifo_wrapper = """ -module $TOPNAME$( -ap_clk, -ap_rst_n, -count, -maxcount, -in0_$HLS_SNAME$_TDATA, -in0_$HLS_SNAME$_TVALID, -in0_$HLS_SNAME$_TREADY, -out_$HLS_SNAME$_TDATA, -out_$HLS_SNAME$_TVALID, -out_$HLS_SNAME$_TREADY -); - -input ap_clk; -input ap_rst_n; -output $COUNT_RANGE$ count; -output $COUNT_RANGE$ maxcount; -input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; -input in0_$HLS_SNAME$_TVALID; -output in0_$HLS_SNAME$_TREADY; -output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; -output out_$HLS_SNAME$_TVALID; -input out_$HLS_SNAME$_TREADY; - -Q_srl #( -.depth($DEPTH$), -.width($WIDTH$) -) -$LAYER_NAME$ -( - .clock(ap_clk), - .reset(!ap_rst_n), - .count(count), - .maxcount(maxcount), - .i_d(in0_$HLS_SNAME$_TDATA), - .i_v(in0_$HLS_SNAME$_TVALID), - .i_r(in0_$HLS_SNAME$_TREADY), - .o_d(out_$HLS_SNAME$_TDATA), - .o_v(out_$HLS_SNAME$_TVALID), - .o_r(out_$HLS_SNAME$_TREADY) -); - -endmodule -""" diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index 27417a78e1..ecbf867b69 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -40,6 +40,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers build_dir = os.environ["FINN_BUILD_DIR"] test_fpga_part = "xc7z020clg400-1" @@ -83,7 +84,7 @@ def prepare_inputs(input_tensor, dt): # outWidth @pytest.mark.parametrize("depth", [16]) # finn_dtype -@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"]]) # , DataType["INT2"]]) +@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado @@ -93,6 +94,7 @@ def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): input_dict = prepare_inputs(x, finn_dtype) model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype) + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) From dddef235f3cfe93623faab864d1a97f304706424 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 14:26:09 +0000 Subject: [PATCH 053/291] [BTS] Infer bts Signed-off-by: aziz bahri --- .../hls/thresholdingbinarysearch_hls.py | 2 +- .../fpgadataflow/thresholding_batch.py | 4 +- .../fpgadataflow/convert_to_hw_layers.py | 118 +++++++ ...fpgadataflow_thresholding_binary_search.py | 287 ++++++++++-------- 4 files changed, 278 insertions(+), 133 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py index a782b21800..97ffc59f7a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py @@ -58,7 +58,7 @@ class ThresholdingBinarySearch_hls(ThresholdingBinarySearch,HLSBackend): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - + def get_nodeattr_types(self): my_attrs = {} my_attrs.update(ThresholdingBinarySearch.get_nodeattr_types(self)) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py index 72ee2f7af6..37c51300a3 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding_batch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2024, Xilinx # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,6 +31,8 @@ import textwrap import warnings from math import ceil, log2 +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 2b8433e59c..aacedcc6f2 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -180,7 +180,125 @@ def apply(self, model): model = model.transform(InferDataTypes()) return (model, graph_modified) +class InferThresholdingLayer(Transformation): + """Convert any MultiThreshold into a standalone thresholding HLS layer.""" + def __init__(self, mem_mode="const", use_rtl_variant=False): + super().__init__() + self.mem_mode = mem_mode + self.use_rtl_variant = use_rtl_variant + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "MultiThreshold": + thl_input = node.input[0] + thl_threshold = node.input[1] + thl_output = node.output[0] + thl_in_shape = model.get_tensor_shape(thl_input) + thl_thres_shape = model.get_tensor_shape(thl_threshold) + idt = model.get_tensor_datatype(thl_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + thl_in_layout = model.get_tensor_layout(thl_input) + if thl_in_layout == DataLayout.NCHW: + thl_input = nchw_to_nhwc(thl_input, model, node_ind) + node_ind += 1 + thl_in_shape = model.get_tensor_shape(thl_input) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + thl_output_layout = model.get_tensor_layout(thl_output) + if thl_output_layout == DataLayout.NCHW: + thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume number of channels is in last dimension + ifc = int(thl_in_shape[-1]) + # create node with no parallelization first + pe = 1 + + odt = model.get_tensor_datatype(thl_output) + scale = getCustomOp(node).get_nodeattr("out_scale") + assert scale == 1.0, ( + node.name + ": MultiThreshold out_scale must be 1 for HLS conversion." + ) + actval = getCustomOp(node).get_nodeattr("out_bias") + assert int(actval) == actval, ( + node.name + ": MultiThreshold out_bias must be integer for HLS conversion." + ) + actval = int(actval) + assert (not odt.signed()) or (actval < 0), ( + node.name + ": Signed output requires actval < 0" + ) + + # Ensure that RTL variant is not inserted for unsupported configuration + is_rtl_variant_compatible = True + + # Perform checks for RTL variant if chosen + if self.use_rtl_variant and is_rtl_variant_compatible: + new_node = helper.make_node( + "Thresholding_Binary_Search", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + activation_bias=actval, + mem_mode=self.mem_mode, + name="Thresholding_Binary_Search_" + node.name, + ) + else: + if self.use_rtl_variant: + warnings.warn( + """%s : RTL Thresholding requested for unsupported + configuration. Falling back to HLS implementation.""" + % node.name + ) + + # create and insert new Thresholding_Batch node + new_node = helper.make_node( + "Thresholding_Batch", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ActVal=actval, + mem_mode=self.mem_mode, + name="Thresholding_Batch_" + node.name, + ) + + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) class InferUpsample(Transformation): """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index 8e6bf5cbe3..c247e9cdfc 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -37,15 +37,19 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.util.basic import gen_finn_dt_tensor - +from qonnx.transformation.infer_shapes import InferShapes +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer from finn.core.rtlsim_exec import rtlsim_exec from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +import finn.core.onnx_exec as oxe test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -141,50 +145,50 @@ def make_single_thresholding_binary_search_modelwrapper( return model -# Test brief: Test that PrepareRTLSim() runs successfully. This function is not -# tested in test_fpgadataflow_thresholding_binary_search() -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): - input_data_type = DataType["INT16"] - act = DataType["INT4"] - fold = -1 - num_input_channels = 16 - - # Handle inputs to the test - pe = generate_pe_value(fold, num_input_channels) - num_steps = act.get_num_possible_values() - 1 - - # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) - thresholds = sort_thresholds_increasing(thresholds) - - # Other non-input parameters - num_input_vecs = [1, 2, 2] - output_data_type = act - if output_data_type == DataType["BIPOLAR"]: - activation_bias = 0 - else: - activation_bias = output_data_type.min() - - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, - ) - - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - return +# # Test brief: Test that PrepareRTLSim() runs successfully. This function is not +# # tested in test_fpgadataflow_thresholding_binary_search() +# @pytest.mark.fpgadataflow +# @pytest.mark.vivado +# def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): +# input_data_type = DataType["INT16"] +# act = DataType["INT4"] +# fold = -1 +# num_input_channels = 16 + +# # Handle inputs to the test +# pe = generate_pe_value(fold, num_input_channels) +# num_steps = act.get_num_possible_values() - 1 + +# # Generate random, non-decreasing thresholds +# thresholds = generate_random_threshold_values( +# input_data_type, num_input_channels, num_steps +# ) +# thresholds = sort_thresholds_increasing(thresholds) + +# # Other non-input parameters +# num_input_vecs = [1, 2, 2] +# output_data_type = act +# if output_data_type == DataType["BIPOLAR"]: +# activation_bias = 0 +# else: +# activation_bias = output_data_type.min() + +# # Generate model from input parameters to the test +# model = make_single_thresholding_binary_search_modelwrapper( +# thresholds, +# pe, +# input_data_type, +# output_data_type, +# activation_bias, +# num_input_vecs, +# ) + +# model = model.transform(SetExecMode("rtlsim")) +# model = model.transform(GiveUniqueNodeNames()) +# model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) +# model = model.transform(HLSSynthIP()) +# model = model.transform(PrepareRTLSim()) +# return # Test brief: Create a Thresholding binary search layer using various parameters @@ -194,11 +198,13 @@ def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) @pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding_binary_search( - activation, input_data_type, fold, num_input_channels + activation, input_data_type, fold, num_input_channels, impl_style, mode ): # Handle inputs to the test pe = generate_pe_value(fold, num_input_channels) @@ -236,88 +242,6 @@ def test_fpgadataflow_thresholding_binary_search( # signed offset y += activation.min() - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, - ) - - model = model.transform(InsertFIFO(True)) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - - # Retrieve the axilite programming sequence for weights - for decoupled mode only - tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - tbs_inst = getCustomOp(tbs_node) - config = tbs_inst.get_dynamic_config(model, 4) - - # Reshape generated data (not from model) - oshape = model.get_tensor_shape("outp") - y_expected = y.reshape(oshape) - - # Helper function that delivers the hook to program the thresholds via AXI-Lite - def config_hook(config): - if config is None: - return None - - def write_thresh_config(sim): - # axi_name = "s_axilite_0_" # works - axi_name = getCustomOp( - model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - ).get_verilog_top_module_intf_names()["axilite"][0] - axi_name += "_0_" - - # Write config registers to the Threshold memory. - # The dictionary defines (addr, value) tuples. - for config_entry in config.values(): - addr = config_entry[0] - val = config_entry[1] - axilite_write(sim, addr, val, basename=axi_name) - - reset_rtlsim(sim) - - return write_thresh_config - - input_dict = {"inp": x} - rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) - y_produced = input_dict["outp"] - assert (y_produced == y_expected).all() - - -# Test brief: Test basic transforms are working -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -def test_fpgadataflow_thresholding_binary_search_transform(impl_style): - input_data_type = DataType["INT16"] - act = DataType["INT4"] - fold = -1 - num_input_channels = 16 - - # Handle inputs to the test - pe = generate_pe_value(fold, num_input_channels) - num_steps = act.get_num_possible_values() - 1 - - # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) - thresholds = sort_thresholds_increasing(thresholds) - - # Other non-input parameters - num_input_vecs = [1, 2, 2] - output_data_type = act - if output_data_type == DataType["BIPOLAR"]: - activation_bias = 0 - else: - activation_bias = output_data_type.min() - # Generate model from input parameters to the test model = make_single_thresholding_binary_search_modelwrapper( impl_style, @@ -329,10 +253,111 @@ def test_fpgadataflow_thresholding_binary_search_transform(impl_style): num_input_vecs, ) + model = model.transform(InferThresholdingLayer()) model = model.transform(SpecializeLayers()) - # model = model.transform(SetExecMode("rtlsim")) + model = model.transform(InferShapes()) + # model = model.transform(SetExecMode(mode)) + # model = model.transform(GiveUniqueNodeNames()) + # if mode == "cppsim": + # model = model.transform(PrepareCppSim()) + # model = model.transform(CompileCppSim()) + # elif mode == "rtlsim": + # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + # model = model.transform(HLSSynthIP()) + # model = model.transform(PrepareRTLSim()) + # input_dict = {"inp": x} + # y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + # model = model.transform(InsertFIFO(True)) # model = model.transform(GiveUniqueNodeNames()) # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) # model = model.transform(HLSSynthIP()) - # model = model.transform(PrepareRTLSim()) - return \ No newline at end of file + # model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + + # # Retrieve the axilite programming sequence for weights - for decoupled mode only + # tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + # tbs_inst = getCustomOp(tbs_node) + # config = tbs_inst.get_dynamic_config(model, 4) + + # # Reshape generated data (not from model) + # oshape = model.get_tensor_shape("outp") + # y_expected = y.reshape(oshape) + + # # Helper function that delivers the hook to program the thresholds via AXI-Lite + # def config_hook(config): + # if config is None: + # return None + + # def write_thresh_config(sim): + # # axi_name = "s_axilite_0_" # works + # axi_name = getCustomOp( + # model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] + # ).get_verilog_top_module_intf_names()["axilite"][0] + # axi_name += "_0_" + + # # Write config registers to the Threshold memory. + # # The dictionary defines (addr, value) tuples. + # for config_entry in config.values(): + # addr = config_entry[0] + # val = config_entry[1] + # axilite_write(sim, addr, val, basename=axi_name) + + # reset_rtlsim(sim) + + # return write_thresh_config + + # input_dict = {"inp": x} + # rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) + # y_produced = input_dict["outp"] + # assert (y_produced == y_expected).all() + + +# # Test brief: Test basic transforms are working +# @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +# @pytest.mark.fpgadataflow +# @pytest.mark.vivado +# def test_fpgadataflow_thresholding_binary_search_transform(impl_style): +# input_data_type = DataType["INT16"] +# act = DataType["INT4"] +# fold = -1 +# num_input_channels = 16 + +# # Handle inputs to the test +# pe = generate_pe_value(fold, num_input_channels) +# num_steps = act.get_num_possible_values() - 1 + +# # Generate random, non-decreasing thresholds +# thresholds = generate_random_threshold_values( +# input_data_type, num_input_channels, num_steps +# ) +# thresholds = sort_thresholds_increasing(thresholds) + +# # Other non-input parameters +# num_input_vecs = [1, 2, 2] +# output_data_type = act +# if output_data_type == DataType["BIPOLAR"]: +# activation_bias = 0 +# else: +# activation_bias = output_data_type.min() + +# # Generate model from input parameters to the test +# model = make_single_thresholding_binary_search_modelwrapper( +# impl_style, +# thresholds, +# pe, +# input_data_type, +# output_data_type, +# activation_bias, +# num_input_vecs, +# ) + +# model = model.transform(SpecializeLayers()) + +# # if "hls" in getCustomOp(model.graph.node[0]).__class__.__name__ and impl_style != "hls": + +# # model = model.transform(SetExecMode("rtlsim")) +# # model = model.transform(GiveUniqueNodeNames()) +# # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) +# # model = model.transform(HLSSynthIP()) +# # model = model.transform(PrepareRTLSim()) +# return \ No newline at end of file From b34b6265e58e2b84a376514d78b18489dacc1e0e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 14:38:49 +0000 Subject: [PATCH 054/291] [BTS] Rename BTS to Thresholding Signed-off-by: aziz bahri --- src/finn/custom_op/fpgadataflow/__init__.py | 6 +++--- src/finn/custom_op/fpgadataflow/hls/__init__.py | 4 ++-- ...{thresholdingbinarysearch_hls.py => thresholding_hls.py} | 6 +++--- .../{thresholdingbinarysearch.py => thresholding.py} | 2 +- .../test_fpgadataflow_thresholding_binary_search.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) rename src/finn/custom_op/fpgadataflow/hls/{thresholdingbinarysearch_hls.py => thresholding_hls.py} (99%) rename src/finn/custom_op/fpgadataflow/{thresholdingbinarysearch.py => thresholding.py} (99%) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 5260f678ef..93c1a4bd1d 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -52,8 +52,8 @@ from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch -from finn.custom_op.fpgadataflow.thresholdingbinarysearch import ( - ThresholdingBinarySearch, +from finn.custom_op.fpgadataflow.thresholding import ( + Thresholding, ) from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour @@ -66,7 +66,7 @@ custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Thresholding_Batch"] = Thresholding_Batch -custom_op["ThresholdingBinarySearch"] = ThresholdingBinarySearch +custom_op["Thresholding"] = Thresholding custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 74d2b982af..87611517f1 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -49,7 +49,7 @@ from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls -from finn.custom_op.fpgadataflow.hls.thresholdingbinarysearch_hls import ThresholdingBinarySearch_hls +from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls custom_op = dict() @@ -72,5 +72,5 @@ custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls -custom_op["ThresholdingBinarySearch_hls"] = ThresholdingBinarySearch_hls +custom_op["Thresholding_hls"] = Thresholding_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py similarity index 99% rename from src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py rename to src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 97ffc59f7a..bb8ca582ea 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholdingbinarysearch_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -31,7 +31,7 @@ import textwrap import warnings from math import ceil, log2 -from finn.custom_op.fpgadataflow.thresholdingbinarysearch import ThresholdingBinarySearch +from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -53,7 +53,7 @@ # the ... here can be any shape (representing groups of vectors) -class ThresholdingBinarySearch_hls(ThresholdingBinarySearch,HLSBackend): +class Thresholding_hls(Thresholding,HLSBackend): """Class that corresponds to finn-hls Thresholding_Batch function.""" def __init__(self, onnx_node, **kwargs): @@ -61,7 +61,7 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = {} - my_attrs.update(ThresholdingBinarySearch.get_nodeattr_types(self)) + my_attrs.update(Thresholding.get_nodeattr_types(self)) my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py b/src/finn/custom_op/fpgadataflow/thresholding.py similarity index 99% rename from src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py rename to src/finn/custom_op/fpgadataflow/thresholding.py index 3d919d3c6e..d6d0d8d01c 100644 --- a/src/finn/custom_op/fpgadataflow/thresholdingbinarysearch.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -33,7 +33,7 @@ from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class ThresholdingBinarySearch(HWCustomOp): +class Thresholding(HWCustomOp): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py index c247e9cdfc..ea331a4565 100755 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py @@ -112,7 +112,7 @@ def make_single_thresholding_binary_search_modelwrapper( node_inp_list = ["inp", "thresh"] Thresholding_node = helper.make_node( - "ThresholdingBinarySearch", + "Thresholding", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", From 207fa941511f4a705044939ff758caef892f5bbe Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 15:21:17 +0000 Subject: [PATCH 055/291] [TBS] Add HLS variant --- .../fpgadataflow/hls/thresholding_hls.py | 97 +++++++++++++++++-- 1 file changed, 91 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index bb8ca582ea..0ad198feb5 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -39,6 +39,7 @@ roundup_to_integer_multiple, ) +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -58,11 +59,45 @@ class Thresholding_hls(Thresholding,HLSBackend): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - + self.variant = "hls" + def get_nodeattr_types(self): - my_attrs = {} - my_attrs.update(Thresholding.get_nodeattr_types(self)) - my_attrs.update(HLSBackend.get_nodeattr_types(self)) + my_attrs = { + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function + "numSteps": ("i", True, 1), + # string defining memory type + "ram_style": ("s", False, "distributed", {"distributed", "block"}), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # initialization value for the thresholding accumulator + "ActVal": ("i", False, 0), + # memory mode for the thresholds + # const -- embedded thresholds, default + # decoupled -- streaming thresholds with streamer packaged inside IP + "mem_mode": ("s", False, "const", {"const", "decoupled"}), + # (mem_mode = decoupled only) whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + # see finn-rtllib/memstream/doc/README for more about the memory + # address map used for writable weights + # IMPORTANT: After using AXI lite to either read or write the weights, + # always "flush" the accelerator by first passing a dummy input + # vector through the accelerator. This will get rid of any old + # weight data from the weight FIFOs. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + } + my_attrs.update(super().get_nodeattr_types()) return my_attrs def calc_tmem(self): @@ -71,8 +106,24 @@ def calc_tmem(self): pe = self.get_nodeattr("PE") return mh // pe + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + def infer_node_datatype(self, model): - pass + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) def verify_node(self): info_messages = [] @@ -464,7 +515,41 @@ def execute_node(self, context, graph): context[node.output[0]] = out oshape = self.get_normal_output_shape() assert context[node.output[0]].shape == oshape, """Output shape is not as expected""" - + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + if self.get_nodeattr("mem_mode") == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + elif self.get_nodeattr("mem_mode") == "const": + output = self.rtlsim(sim, inp) + else: + raise Exception("Unrecognized mem_mode") + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} From 09d5d3094b1b056fd6593d8680f8616eae47a9df Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 15:26:35 +0000 Subject: [PATCH 056/291] [TBS] resolve merge conflict --- src/finn/custom_op/fpgadataflow/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 93c1a4bd1d..bd9c0366e7 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -55,7 +55,6 @@ from finn.custom_op.fpgadataflow.thresholding import ( Thresholding, ) -from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation From 7abb066a5d32f4bdcb9a58a207845e960c4e4ad5 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:15:53 +0000 Subject: [PATCH 057/291] [TBS] Minimise Thresholding class methods: --- .../custom_op/fpgadataflow/thresholding.py | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index d6d0d8d01c..004bf1aec0 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -77,39 +77,15 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_padded_odim(): - pass - - def get_exp_cycles(): - pass - - def get_normal_input_shape(): - pass - - def get_normal_output_shape(): - pass - def get_folded_input_shape(): - pass - def get_folded_output_shape(): - pass def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() return super().make_const_shape_op(oshape) - def infer_node_datatype(): - pass def verify_node(): pass - def get_input_datatype(): - pass - def get_output_datatype(): - pass - def get_instream_width(): - pass - def get_outstream_width(): + def infer_node_datatype(): pass def get_number_output_values(): pass - def execute_node(self, context, graph): pass From f29d7439eda3448000aa74e5dc76c307b96253ae Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:17:00 +0000 Subject: [PATCH 058/291] [TBS] InfeThreshold will only instantiate Thresholding class --- .../fpgadataflow/convert_to_hw_layers.py | 71 +++++-------------- 1 file changed, 18 insertions(+), 53 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index aacedcc6f2..88a9a64cd6 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -183,10 +183,9 @@ def apply(self, model): class InferThresholdingLayer(Transformation): """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - def __init__(self, mem_mode="const", use_rtl_variant=False): + def __init__(self, mem_mode="const"): super().__init__() self.mem_mode = mem_mode - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -241,63 +240,29 @@ def apply(self, model): node.name + ": Signed output requires actval < 0" ) - # Ensure that RTL variant is not inserted for unsupported configuration - is_rtl_variant_compatible = True - - # Perform checks for RTL variant if chosen - if self.use_rtl_variant and is_rtl_variant_compatible: - new_node = helper.make_node( - "Thresholding_Binary_Search", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - activation_bias=actval, - mem_mode=self.mem_mode, - name="Thresholding_Binary_Search_" + node.name, - ) - else: - if self.use_rtl_variant: - warnings.warn( - """%s : RTL Thresholding requested for unsupported - configuration. Falling back to HLS implementation.""" - % node.name - ) - - # create and insert new Thresholding_Batch node - new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, - ) + new_node = helper.make_node( + "Thresholding", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ActVal=actval, + mem_mode=self.mem_mode, + name="Thresholding_Batch_" + node.name, + ) graph.node.insert(insert_point, new_node) # remove old node graph.node.remove(node) graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) return (model, graph_modified) class InferUpsample(Transformation): """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" From 34005e38a00f3302e7b0d7b784fa52be2abc2679 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:18:15 +0000 Subject: [PATCH 059/291] [TBS] Testcase to convert Thresholding layer --- .../test_convert_to_hls_thresholding.py | 109 ++---------------- 1 file changed, 11 insertions(+), 98 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hls_thresholding.py index 9c233bdd06..e96581dc89 100755 --- a/tests/fpgadataflow/test_convert_to_hls_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hls_thresholding.py @@ -49,6 +49,8 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -75,20 +77,6 @@ def generate_pe_value(fold, num_input_channels): return pe -# n = batch, c = channel, h = height, w = width of feature map -# Standard = NCHW; FINN = NHWC -# Convert from NCHW to NHWC -def convert_np_array_to_finn_data_layout(data): - return np.transpose(data, (0, 2, 3, 1)) - - -# n = batch, c = channel, h = height, w = width of feature map -# Standard = NCHW; FINN = NHWC -# Convert from NHWC to NCHW -def convert_np_array_to_standard_data_layout(data): - return np.transpose(data, (0, 3, 1, 2)) - - def make_single_multithresholding_modelwrapper( thresholds, pe, @@ -144,9 +132,11 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) @pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_tbs_rtl_variant( +def test_convert_multithreshold_to_hardware( + impl_style, activation, input_data_type, fold, @@ -173,10 +163,6 @@ def test_convert_to_hls_tbs_rtl_variant( else: activation_bias = output_data_type.min() - # generate random input data - tensor_shape = tuple(num_input_vecs + [num_input_channels]) - x = gen_finn_dt_tensor(input_data_type, tensor_shape) - # Generate random thresholds and sort in ascending order thresholds = generate_random_threshold_values( input_data_type, num_input_channels, num_steps @@ -185,73 +171,8 @@ def test_convert_to_hls_tbs_rtl_variant( # provide non-decreasing/ascending thresholds thresholds = sort_thresholds_increasing(thresholds) - x_nhwc = convert_np_array_to_standard_data_layout(x) - y = multithreshold(x_nhwc, thresholds) - - # convert back to NHWC for comparison to hw outputs - y = convert_np_array_to_finn_data_layout(y) - if activation == DataType["BIPOLAR"]: - # binary to bipolar - y = 2 * y - 1 - else: - # signed offset - y += activation.min() - - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, - ) - - model = model.transform(InsertFIFO(True)) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - - # Retrieve the axilite programming sequence for weights - for decoupled mode only - tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - tbs_inst = getCustomOp(tbs_node) - config = tbs_inst.get_dynamic_config(model, 4) - - # Reshape generated data (not from model) - oshape = model.get_tensor_shape("outp") - y_expected = y.reshape(oshape) - - # Helper function that delivers the hook to program the thresholds via AXI-Lite - def config_hook(config): - if config is None: - return None - - def write_thresh_config(sim): - # axi_name = "s_axilite_0_" # works - axi_name = getCustomOp( - model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - ).get_verilog_top_module_intf_names()["axilite"][0] - axi_name += "_0_" - - # Write config registers to the Threshold memory. - # The dictionary defines (addr, value) tuples. - for config_entry in config.values(): - addr = config_entry[0] - val = config_entry[1] - axilite_write(sim, addr, val, basename=axi_name) - - reset_rtlsim(sim) - - return write_thresh_config - - input_dict = {"inp": x} - rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) - y_produced = input_dict["outp"] - assert (y_produced == y_expected).all() - # Make a Multithreshold graph and convert to thresholding binary search node - new_model = make_single_multithresholding_modelwrapper( + model = make_single_multithresholding_modelwrapper( thresholds, pe, input_data_type, @@ -260,17 +181,9 @@ def write_thresh_config(sim): num_input_vecs, ) - # Recreate the model using the ConvertToHLS transform - new_model = new_model.transform( - to_hls.InferThresholdingLayer(mem_mode="decoupled", use_rtl_variant=True) - ) - new_model = new_model.transform(InsertFIFO(True)) - new_model = new_model.transform(GiveUniqueNodeNames()) - new_model = new_model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - new_model = new_model.transform(HLSSynthIP()) - new_model = new_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model = model.transform(InferThresholdingLayer()) + model = model.transform(SpecializeLayers()) + model = model.transform(InferShapes()) - input_dict = {"inp": x} - rtlsim_exec(new_model, input_dict, pre_hook=config_hook(config)) - y_produced_new = input_dict["outp"] - assert (y_produced_new == y_expected).all() + node_variant = getCustomOp(model.graph.node[0]).variant + assert (impl_style == node_variant) \ No newline at end of file From e2c60f0b6b10eee530764f590014048ef605e8ad Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:19:30 +0000 Subject: [PATCH 060/291] [TBS] Rename to_hw test --- ..._to_hls_thresholding.py => test_convert_to_hw_thresholding.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/fpgadataflow/{test_convert_to_hls_thresholding.py => test_convert_to_hw_thresholding.py} (100%) diff --git a/tests/fpgadataflow/test_convert_to_hls_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py similarity index 100% rename from tests/fpgadataflow/test_convert_to_hls_thresholding.py rename to tests/fpgadataflow/test_convert_to_hw_thresholding.py From d680a707c48111a3c5a77bbb3acc38f0745db96b Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:20:37 +0000 Subject: [PATCH 061/291] [TBS] Remove functional testing for now --- ...fpgadataflow_thresholding_binary_search.py | 363 ------------------ 1 file changed, 363 deletions(-) delete mode 100755 tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py deleted file mode 100755 index ea331a4565..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_binary_search.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import numpy as np -from onnx import TensorProto, helper -from pyverilator.util.axi_utils import axilite_write, reset_rtlsim -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.multithreshold import multithreshold -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor -from qonnx.transformation.infer_shapes import InferShapes -from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer -from finn.core.rtlsim_exec import rtlsim_exec -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -import finn.core.onnx_exec as oxe - -test_fpga_part = "xczu3eg-sbva484-1-e" -target_clk_ns = 5 - - -# Helper functions -def sort_thresholds_increasing(thresholds): - return np.sort(thresholds, axis=1) - - -def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): - return np.random.randint( - input_data_type.min(), - input_data_type.max() + 1, - (num_input_channels, num_steps), - ).astype(np.float32) - - -def generate_pe_value(fold, num_input_channels): - if fold == -1: - fold = num_input_channels - pe = num_input_channels // fold - assert num_input_channels % pe == 0 - return pe - - -# n = batch, c = channel, h = height, w = width of feature map -# Standard = NCHW; FINN = NHWC -# Convert from NCHW to NHWC -def convert_np_array_to_finn_data_layout(data): - return np.transpose(data, (0, 2, 3, 1)) - - -# n = batch, c = channel, h = height, w = width of feature map -# Standard = NCHW; FINN = NHWC -# Convert from NHWC to NCHW -def convert_np_array_to_standard_data_layout(data): - return np.transpose(data, (0, 3, 1, 2)) - - -def make_single_thresholding_binary_search_modelwrapper( - impl_style, - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, -): - - NumChannels = thresholds.shape[0] - - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] - ) - - node_inp_list = ["inp", "thresh"] - - Thresholding_node = helper.make_node( - "Thresholding", - node_inp_list, - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=NumChannels, - PE=pe, - numSteps=thresholds.shape[1], - inputDataType=input_data_type.name, - weightDataType=input_data_type.name, - outputDataType=output_data_type.name, - activation_bias=activation_bias, - numInputVectors=num_input_vecs, - preferred_impl_style=impl_style, - ) - graph = helper.make_graph( - nodes=[Thresholding_node], - name="thresholding_graph", - inputs=[inp], - outputs=[outp], - ) - - model = helper.make_model(graph, producer_name="thresholding-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", input_data_type) - model.set_tensor_datatype("outp", output_data_type) - - model.set_tensor_datatype("thresh", input_data_type) - model.set_initializer("thresh", thresholds) - return model - - -# # Test brief: Test that PrepareRTLSim() runs successfully. This function is not -# # tested in test_fpgadataflow_thresholding_binary_search() -# @pytest.mark.fpgadataflow -# @pytest.mark.vivado -# def test_fpgadataflow_thresholding_binary_search_prepare_rtlsim(): -# input_data_type = DataType["INT16"] -# act = DataType["INT4"] -# fold = -1 -# num_input_channels = 16 - -# # Handle inputs to the test -# pe = generate_pe_value(fold, num_input_channels) -# num_steps = act.get_num_possible_values() - 1 - -# # Generate random, non-decreasing thresholds -# thresholds = generate_random_threshold_values( -# input_data_type, num_input_channels, num_steps -# ) -# thresholds = sort_thresholds_increasing(thresholds) - -# # Other non-input parameters -# num_input_vecs = [1, 2, 2] -# output_data_type = act -# if output_data_type == DataType["BIPOLAR"]: -# activation_bias = 0 -# else: -# activation_bias = output_data_type.min() - -# # Generate model from input parameters to the test -# model = make_single_thresholding_binary_search_modelwrapper( -# thresholds, -# pe, -# input_data_type, -# output_data_type, -# activation_bias, -# num_input_vecs, -# ) - -# model = model.transform(SetExecMode("rtlsim")) -# model = model.transform(GiveUniqueNodeNames()) -# model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) -# model = model.transform(HLSSynthIP()) -# model = model.transform(PrepareRTLSim()) -# return - - -# Test brief: Create a Thresholding binary search layer using various parameters -# and test against a SW generated & simulated dataset -# N.B. Fold values where C % PE != 0 fail -@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) -@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) -@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) -@pytest.mark.parametrize("num_input_channels", [16]) -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -@pytest.mark.parametrize("mode", ["cppsim", "rtlsim"]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -@pytest.mark.slow -def test_fpgadataflow_thresholding_binary_search( - activation, input_data_type, fold, num_input_channels, impl_style, mode -): - # Handle inputs to the test - pe = generate_pe_value(fold, num_input_channels) - num_steps = activation.get_num_possible_values() - 1 - - # Other non-input parameters - num_input_vecs = [1, 2, 2] - output_data_type = activation - if output_data_type == DataType["BIPOLAR"]: - activation_bias = 0 - else: - activation_bias = output_data_type.min() - - # generate random input data - tensor_shape = tuple(num_input_vecs + [num_input_channels]) - x = gen_finn_dt_tensor(input_data_type, tensor_shape) - - # Generate random thresholds and sort in ascending order - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) - - # provide non-decreasing/ascending thresholds - thresholds = sort_thresholds_increasing(thresholds) - - x_nhwc = convert_np_array_to_standard_data_layout(x) - y = multithreshold(x_nhwc, thresholds) - - # convert back to NHWC for comparison to hw outputs - y = convert_np_array_to_finn_data_layout(y) - if activation == DataType["BIPOLAR"]: - # binary to bipolar - y = 2 * y - 1 - else: - # signed offset - y += activation.min() - - # Generate model from input parameters to the test - model = make_single_thresholding_binary_search_modelwrapper( - impl_style, - thresholds, - pe, - input_data_type, - output_data_type, - activation_bias, - num_input_vecs, - ) - - model = model.transform(InferThresholdingLayer()) - model = model.transform(SpecializeLayers()) - model = model.transform(InferShapes()) - # model = model.transform(SetExecMode(mode)) - # model = model.transform(GiveUniqueNodeNames()) - # if mode == "cppsim": - # model = model.transform(PrepareCppSim()) - # model = model.transform(CompileCppSim()) - # elif mode == "rtlsim": - # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - # model = model.transform(HLSSynthIP()) - # model = model.transform(PrepareRTLSim()) - # input_dict = {"inp": x} - # y_produced = oxe.execute_onnx(model, input_dict)["outp"] - - # model = model.transform(InsertFIFO(True)) - # model = model.transform(GiveUniqueNodeNames()) - # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) - # model = model.transform(HLSSynthIP()) - # model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) - - # # Retrieve the axilite programming sequence for weights - for decoupled mode only - # tbs_node = model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - # tbs_inst = getCustomOp(tbs_node) - # config = tbs_inst.get_dynamic_config(model, 4) - - # # Reshape generated data (not from model) - # oshape = model.get_tensor_shape("outp") - # y_expected = y.reshape(oshape) - - # # Helper function that delivers the hook to program the thresholds via AXI-Lite - # def config_hook(config): - # if config is None: - # return None - - # def write_thresh_config(sim): - # # axi_name = "s_axilite_0_" # works - # axi_name = getCustomOp( - # model.get_nodes_by_op_type("Thresholding_Binary_Search")[0] - # ).get_verilog_top_module_intf_names()["axilite"][0] - # axi_name += "_0_" - - # # Write config registers to the Threshold memory. - # # The dictionary defines (addr, value) tuples. - # for config_entry in config.values(): - # addr = config_entry[0] - # val = config_entry[1] - # axilite_write(sim, addr, val, basename=axi_name) - - # reset_rtlsim(sim) - - # return write_thresh_config - - # input_dict = {"inp": x} - # rtlsim_exec(model, input_dict, pre_hook=config_hook(config)) - # y_produced = input_dict["outp"] - # assert (y_produced == y_expected).all() - - -# # Test brief: Test basic transforms are working -# @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -# @pytest.mark.fpgadataflow -# @pytest.mark.vivado -# def test_fpgadataflow_thresholding_binary_search_transform(impl_style): -# input_data_type = DataType["INT16"] -# act = DataType["INT4"] -# fold = -1 -# num_input_channels = 16 - -# # Handle inputs to the test -# pe = generate_pe_value(fold, num_input_channels) -# num_steps = act.get_num_possible_values() - 1 - -# # Generate random, non-decreasing thresholds -# thresholds = generate_random_threshold_values( -# input_data_type, num_input_channels, num_steps -# ) -# thresholds = sort_thresholds_increasing(thresholds) - -# # Other non-input parameters -# num_input_vecs = [1, 2, 2] -# output_data_type = act -# if output_data_type == DataType["BIPOLAR"]: -# activation_bias = 0 -# else: -# activation_bias = output_data_type.min() - -# # Generate model from input parameters to the test -# model = make_single_thresholding_binary_search_modelwrapper( -# impl_style, -# thresholds, -# pe, -# input_data_type, -# output_data_type, -# activation_bias, -# num_input_vecs, -# ) - -# model = model.transform(SpecializeLayers()) - -# # if "hls" in getCustomOp(model.graph.node[0]).__class__.__name__ and impl_style != "hls": - -# # model = model.transform(SetExecMode("rtlsim")) -# # model = model.transform(GiveUniqueNodeNames()) -# # model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) -# # model = model.transform(HLSSynthIP()) -# # model = model.transform(PrepareRTLSim()) -# return \ No newline at end of file From 9bc48a6cc269c8aaaba6dc1605e85f922f992607 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:23:05 +0000 Subject: [PATCH 062/291] [TBS] remove unused thresholding scripts --- .../fpgadataflow/thresholding_batch.py | 940 ------------------ .../thresholding_binary_search.py | 766 -------------- 2 files changed, 1706 deletions(-) delete mode 100644 src/finn/custom_op/fpgadataflow/thresholding_batch.py delete mode 100755 src/finn/custom_op/fpgadataflow/thresholding_binary_search.py diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/thresholding_batch.py deleted file mode 100644 index 37c51300a3..0000000000 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ /dev/null @@ -1,940 +0,0 @@ -# Copyright (c) 2024, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import textwrap -import warnings -from math import ceil, log2 -from finn.custom_op.fpgadataflow.fmpadding import FMPadding -from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend -from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) - -# ONNX i/o tensor shape assumptions for Thresholding: -# input 0 is the input tensor, shape (..., NumChannels) -# input 1 is the threshold tensor, shape (NumChannels, n_thres) -# output 0 is the output tensor, shape (..., NumChannels) - same as input -# the ... here can be any shape (representing groups of vectors) - - -class Thresholding_Batch(HLSCustomOp): - """Class that corresponds to finn-hls Thresholding_Batch function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - # parallelization; channels thresholded per cycle - "PE": ("i", True, 0), - # number of channels (each may have different thresholds) - "NumChannels": ("i", True, 0), - # number of steps in thresholding function - "numSteps": ("i", True, 1), - # string defining memory type - "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - # initialization value for the thresholding accumulator - "ActVal": ("i", False, 0), - # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", "decoupled"}), - # (mem_mode = decoupled only) whether weights (thresholds) will be - # writable through an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. - # see finn-rtllib/memstream/doc/README for more about the memory - # address map used for writable weights - # IMPORTANT: After using AXI lite to either read or write the weights, - # always "flush" the accelerator by first passing a dummy input - # vector through the accelerator. This will get rid of any old - # weight data from the weight FIFOs. - "runtime_writeable_weights": ("i", False, 0, {0, 1}), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def calc_tmem(self): - """Calculates and returns TMEM.""" - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return mh // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype().name), - str(idt.name), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required Threshold_Batch attributes do not exist.""") - - return info_messages - - def bram_estimation(self): - """Calculates BRAM cost if resource set to BRAM""" - style = self.get_nodeattr("ram_style") - P = self.get_nodeattr("PE") - idt = self.get_input_datatype() - A = idt.bitwidth() - tmem = self.calc_tmem() - - if style == "block" and tmem > 1: - return int(ceil(A * P / 16)) * int(ceil(tmem / 1024)) - else: - return 0 - - def lut_estimation(self): - """Calculates LUT cost, taking memory resource type into account""" - # TODO add in/out FIFO contributions - style = self.get_nodeattr("ram_style") - P = self.get_nodeattr("PE") - idt = self.get_input_datatype() - A = idt.bitwidth() - tmem = self.calc_tmem() - # cost of comparators - comparator_cost = A * P - # cost of LUTRAM - if style == "distributed" and tmem > 1: - lutram_cost = P * A * int(ceil(tmem / 64)) - else: - lutram_cost = 0 - # total cost - return comparator_cost + lutram_cost - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_weight_datatype(self): - """Returns FINN DataType of thresholds, here called weights.""" - return DataType[self.get_nodeattr("weightDataType")] - - def minimize_accumulator_width(self, model): - "Minimize threshold width ('accumulator width' here due to convention)" - thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - self.set_nodeattr("weightDataType", tdt.name) - # Update QONNX DataType of tensor for consistency - model.set_tensor_datatype(self.onnx_node.input[1], tdt) - return DataType[self.get_nodeattr("weightDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if self.get_nodeattr("mem_mode") == "decoupled": - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - n_thres_steps = self.get_nodeattr("numSteps") - w_width = pe * wp * n_thres_steps - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_ap_int_max_w(self): - temp_value = super().get_ap_int_max_w() - weightstream = self.get_weightstream_width() - return max([weightstream, temp_value]) - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - # fill in TSrcI - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for unsigned inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = mh // pe - assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" - if not self.get_input_datatype().signed(): - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (mh, 1)) - assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - - def make_weight_file(self, weights, weight_file_mode, weight_file_name): - """Produce a file containing given weights (thresholds) in appropriate - format for this layer. This file can be used for either synthesis or - run-time reconfig of weights. - - Arguments: - - * weights : numpy array with weights to be put into the file - * weight_file_mode : one of {hls_header, decoupled_verilog_dat, - decoupled_runtime} - * weight_file_name : filename for the weight file to be generated - - """ - threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) - tdt = self.get_weight_datatype() - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - if weight_file_mode == "hls_header": - # save thresholds in thresh.h - thresholds_hls_code = numpy_to_hls_code( - threshold_tensor, tdt, "thresholds", False, True - ) - # write thresholds into thresh.h - f_thresh = open(weight_file_name, "w") - tdt_hls = tdt.get_hls_datatype_str() - # use binary to export bipolar activations - export_odt = self.get_output_datatype() - if self.get_output_datatype() == DataType["BIPOLAR"]: - export_odt = DataType["BINARY"] - odt_hls = export_odt.get_hls_datatype_str() - f_thresh.write( - "static ThresholdsActivation<{},{},{},{},{},{},{}> threshs \ - = ".format( - self.calc_tmem(), - self.get_nodeattr("PE"), - threshold_tensor.shape[-1], - tdt_hls, - odt_hls, - self.get_nodeattr("ActVal"), - "comp::less_equal<%s, %s>" % (tdt_hls, tdt_hls), - ) - ) - f_thresh.write(thresholds_hls_code) - f_thresh.close() - elif "decoupled" in weight_file_mode: - # streaming thresholds need to be organized differently - # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) - decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) - # TODO add flips/reversals as needed here - # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) - pe = self.get_nodeattr("PE") - n_thres_steps = self.get_nodeattr("numSteps") - decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) - decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) - decoupled_thres = decoupled_thres.copy() - decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( - 1, -1, pe * n_thres_steps - ) - decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() - - if weight_file_mode == "decoupled_npy": - # save weight stream into npy for cppsim - np.save(weight_file_name, decoupled_thres) - elif weight_file_mode == "decoupled_verilog_dat": - # convert weight values into hexstring - weight_width = self.get_weightstream_width() - # pad to nearest 4 bits to get hex strings - weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" - ) - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - f.write(val + "\n") - elif weight_file_mode == "decoupled_runtime": - # memstream axi-lite interface will map each mem line to - # one or multiple 32-bit words - weight_width = self.get_weightstream_width() - words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) - if words_per_memwidth < 1: - words_per_memwidth = 1 - weight_width_padded = words_per_memwidth * 32 - # first, pack and ensure padding to 32 bits - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" - ) - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - # split into groups of 8 hex digits (= 32 bits) - words_32b = textwrap.wrap(val, 8) - words_32b.reverse() - for word_32b in words_32b: - f.write(word_32b + "\n") - else: - raise Exception("Decoupled weight export not yet implemented") - else: - raise Exception("Unknown weight_file_mode") - - def generate_params(self, model, path): - code_gen_dir = path - thresholds = model.get_initializer(self.onnx_node.input[1]) - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - # save thresholds in thresh.h - weight_filename = "{}/thresh.h".format(code_gen_dir) - self.make_weight_file(thresholds, "hls_header", weight_filename) - elif mem_mode == "decoupled": - # save decoupled weights for cppsim - weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) - self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) - # also save weights as Verilog .dat file - # This file will be ignored when synthesizing UltraScale memory. - weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) - self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl) - else: - raise Exception("Unrecognized mem_mode") - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for Thresholding_Batch") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - oshape = self.get_normal_output_shape() - assert context[node.output[0]].shape == oshape, """Output shape is not as expected""" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if self.get_nodeattr("mem_mode") == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - wei = npy_to_rtlsim_input( - "{}/thresholds.npy".format(code_gen_dir), export_wdt, wnbits - ) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - elif self.get_nodeattr("mem_mode") == "const": - output = self.rtlsim(sim, inp) - else: - raise Exception("Unrecognized mem_mode") - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] - if self.get_nodeattr("mem_mode") == "const": - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - # TODO check and add whatever missing - def defines(self, var): - numReps = 1 - numInputVectors = list(self.get_nodeattr("numInputVectors")) - total_spatial_size = int(np.prod(numInputVectors)) - - self.code_gen_dict["$DEFINES$"] = [ - """#define NumChannels1 {}\n #define PE1 {}\n #define numReps {}\n - #define ImgDim1 {}""".format( - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PE"), - numReps, - total_spatial_size, - ) - ] - if self.get_nodeattr("mem_mode") == "decoupled": - self.code_gen_dict["$DEFINES$"].append( - "#define ActVal1 %d" % self.get_nodeattr("ActVal") - ) - self.code_gen_dict["$DEFINES$"].append( - "#define ThresType1 %s" % self.get_weight_datatype().get_hls_datatype_str() - ) - self.code_gen_dict["$DEFINES$"].append( - "#define NumSteps1 %d" % self.get_nodeattr("numSteps") - ) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": - tdt = self.get_weight_datatype() - elem_bits = tdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = tdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/thresholds.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, ImgDim1);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - def docompute(self): - tmpl_args = self.get_template_param_values() - node = self.onnx_node - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0_{}, out_{}, threshs, numReps);""".format( - node.op_type, - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - self.hls_sname(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled": - # note that numReps is set to 1 in the invocation below, since - # - for cppsim the repetition comes from the threshold stream reader+input - # - for synth the unit runs continuously anyway (ap_ctrl_none) - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0_{}, out_{}, weights_{}, numReps);""".format( - "Thresholding_Stream_Batch", - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - ) - ] - else: - raise Exception("Unrecognized mem_mode") - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - if self.get_nodeattr("mem_mode") == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif self.get_nodeattr("mem_mode") == "decoupled": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - else: - raise Exception("Unrecognized mem_mode") - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if self.get_nodeattr("mem_mode") == "const": - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - # set resource type - ram_style = self.get_nodeattr("ram_style") - pe = self.get_nodeattr("PE") - ich = self.get_nodeattr("NumChannels") - # if PE less than NumChannels, assign cores according to ram_style; - # otherwise if PE == NumChannels, Vivado HLS will unroll to FFs - if pe < ich: - if ram_style == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") - ) - elif ram_style == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") - ) - else: - raise Exception( - """Invalid value for attribute ram_style! Is currently set to: {} - has to be set to one of ("block", "distributed")""".format( - ram_style - ) - ) - elif self.get_nodeattr("mem_mode") == "decoupled": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - - def code_generation_ipi(self): - cmd = [] - # add streamer if needed - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": - node_name = self.onnx_node.name - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - sname = self.hls_sname() - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) - # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "amd.com:finn:memstream:1.0" - strm_inst = node_name + "_wstrm" - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) - ) - cmd.append( - "set_property -dict [list " - "CONFIG.DEPTH {%d} " - "CONFIG.WIDTH {%d} " - "CONFIG.INIT_FILE {%s} " - "CONFIG.RAM_STYLE {%s} " - "] [get_bd_cells /%s/%s]" - % ( - self.calc_tmem(), - self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", - self.get_nodeattr("ram_style"), - node_name, - strm_inst, - ) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] " - "[get_bd_intf_pins %s/%s/weights_%s]" - % (node_name, strm_inst, node_name, node_name, sname) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" - % (node_name, rst_name, node_name, strm_inst) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" - % (node_name, clk_name, node_name, strm_inst) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" - % (node_name, rst_name, node_name, node_name, rst_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]" - % (node_name, clk_name, node_name, node_name, clk_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s/%s]" - % (node_name, din_name, node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s/%s]" - % (node_name, dout_name, node_name, node_name, dout_name) - ) - if runtime_writable: - # expose axi lite interface for writeable weights - axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/%s] " - "[get_bd_intf_pins %s/%s/%s]" - % (node_name, axilite_name, node_name, strm_inst, axilite_name) - ) - # TODO calculate and pass in segment size here - cmd.append("assign_bd_address") - cmd.append("save_bd_design") - elif mem_mode == "const": - # base class impl sufficient for const mode - return super().code_generation_ipi() - else: - raise Exception("Unrecognized mem_mode for Thresholding_Batch") - return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - ret_dict = {} - weight_bits = self.get_weight_datatype().bitwidth() - out_features = self.get_nodeattr("NumChannels") - num_steps = self.get_nodeattr("numSteps") - # thresholds are called weights in this layer - thres_param_type = "param_threshold_%db" % (weight_bits) - thres_count = out_features * num_steps - ret_dict[thres_param_type] = thres_count - return ret_dict - - def ipgen_extra_directives(self): - "Return a list of extra tcl directives for HLS synthesis." - - return ["config_compile -pipeline_style frp"] - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_tmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py b/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py deleted file mode 100755 index cde0d8dc79..0000000000 --- a/src/finn/custom_op/fpgadataflow/thresholding_binary_search.py +++ /dev/null @@ -1,766 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import math -import numpy as np -import os -import shutil -import warnings -from pyverilator.util.axi_utils import rtlsim_multi_io -from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import ( - find_next_power_of_2, - get_memutil_alternatives, - get_rtlsim_trace_depth, - make_build_dir, - mem_primitives_versal, - pyverilate_get_liveness_threshold_cycles, -) -from finn.util.data_packing import ( - npy_to_rtlsim_input, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) - -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - -"""@package thresholding_binary_search -- ONNX i/o tensor shape assumptions for Thresholding: -- input 0 is the input tensor, shape (..., NumChannels) -- input 1 is the threshold tensor, shape (NumChannels, n_thres) -- output 0 is the output tensor, shape (..., NumChannels) - same as input -- the '...' here can be any shape (representing groups of vectors) - -This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' -for a HLS equivalent. -""" - - -class Thresholding_Binary_Search(HLSCustomOp): - """Class that corresponds to finn-rtllib 'thresholding' function.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - def get_nodeattr_types(self): - my_attrs = { - # parallelization; channels thresholded per cycle - "PE": ("i", True, 0), - # number of channels (each may have different thresholds) - "NumChannels": ("i", True, 0), - # number of steps in thresholding function. Used only in decoupled mode - "numSteps": ("i", True, 1), - # FINN DataTypes for inputs, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - # name of the top module in verilog template. Used by PyVerilator - # and IPI generation - "gen_top_module": ("s", False, ""), - # bias to be applied to outputs of the node - "activation_bias": ("i", False, 0), - # whether weights (thresholds) will be - # writable through an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. - "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # memory depth triggers for threshold storage - "depth_trigger_uram": ("i", False, 0), - "depth_trigger_bram": ("i", False, 0), - # enable uniform thres optimization - # doesn't actually do anything yet, only - # for resource estimations - "uniform_thres": ("i", False, 0, {0, 1}), - # enable deep pipelining for easier timing closure - # setting to 0 may save some FFs but otherwise leave on - "deep_pipeline": ("i", False, 1, {0, 1}), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_pe_mem_geometries(self): - pe = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - wdt_bits = wdt.bitwidth() - odt = self.get_output_datatype() - odt_bits = odt.bitwidth() - t_channels = self.get_nodeattr("NumChannels") - cf = t_channels / pe - is_uniform = self.get_nodeattr("uniform_thres") - if is_uniform: - ret = [(odt_bits - x, cf * (2**x)) for x in range(1, odt_bits)] - else: - ret = [(wdt_bits, (cf) * 2**x) for x in range(odt_bits)] - return ret - - def get_memory_estimate(self): - res_dict = {} - depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") - depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") - pe = self.get_nodeattr("PE") - ret = self.get_pe_mem_geometries() - for mem_cfg in ret: - (width, depth) = mem_cfg - primitives = mem_primitives_versal - if depth_trigger_bram != 0 or depth_trigger_uram != 0: - if depth >= depth_trigger_bram and depth < depth_trigger_uram: - primitives = {k: v for (k, v) in mem_primitives_versal.items() if "BRAM" in k} - elif depth >= depth_trigger_uram: - primitives = {k: v for (k, v) in mem_primitives_versal.items() if "URAM" in k} - alts = get_memutil_alternatives(mem_cfg, primitives) - primary_alt = alts[0] - res_type = primary_alt[0].split("_")[0] - res_count, eff, waste = primary_alt[1] - res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count - return res_dict - - def calc_tmem(self): - """Calculates and returns TMEM.""" - num_channels = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return num_channels // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - """Used for FINN DataType inference: set the output tensors' datatypes - accordingly for this node""" - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype().name), - str(idt.name), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - """Required by the FINN nalysis module. Checks if custom ops in graph - are correctly built, with all attributes and inputs.""" - return [] - - def bram_estimation(self): - res_dict = self.get_memory_estimate() - return res_dict.get("BRAM", 0) - - def uram_estimation(self): - res_dict = self.get_memory_estimate() - return res_dict.get("URAM", 0) - - def lut_estimation(self): - res_dict = self.get_memory_estimate() - return res_dict.get("LUTRAM", 0) - - def get_input_datatype(self, ind=0): - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - return DataType[self.get_nodeattr("outputDataType")] - - def get_weight_datatype(self): - """The term 'weights' and 'thresholds' are used interchangably in this class.""" - return DataType[self.get_nodeattr("weightDataType")] - - def minimize_accumulator_width(self, model): - "Minimize threshold width ('accumulator width' here due to convention)" - thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - self.set_nodeattr("weightDataType", tdt.name) - return DataType[self.get_nodeattr("weightDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - - def get_weightstream_width(self): - """Returns weight stream width""" - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - n_thres_steps = self.get_nodeattr("numSteps") - w_width = pe * wp * n_thres_steps - return w_width - - def get_folded_input_shape(self, ind=0): - fold = self.calc_tmem() - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - num_channels = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [num_channels]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for unsigned inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = mh // pe - assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" - if not self.get_input_datatype().signed(): - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (mh, 1)) - assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - - def get_all_meminit_filenames(self, abspath=False): - "Return a list of all .dat memory initializer files used for this node" - dat_files = [] - t_path = self.get_nodeattr("code_gen_dir_ipgen") if abspath else "." - pe = self.get_nodeattr("PE") - output_data_type = self.get_nodeattr("outputDataType") # output precision - o_bitwidth = DataType[output_data_type].bitwidth() - for stage in range(o_bitwidth): - for pe_value in range(pe): - thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( - self.onnx_node.name, - pe_value, - stage, - ) - dat_files.append(thresh_file) - return dat_files - - def prepare_codegen_rtl_values(self, model): - """All dictionary values produced in this function are to replace - their key value(s) in the RTL template files""" - code_gen_dict = {} - - # TODO check for sortedness and size here? - # RTL component currently always expects 2^N-1 thresholds, but - # sometimes we have fewer due to e.g. narrow range quantization - thresholds = model.get_initializer(self.onnx_node.input[1]) - # add dummy dimension as final dimension (that's what gets packed with next call) - thresholds = np.expand_dims(thresholds, axis=-1) - wdt = self.get_weight_datatype() - bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) - t_packed = pack_innermost_dim_as_hex_string( - thresholds, - wdt, - bw_hexdigit, - prefix="", - ) - - t_path = self.get_nodeattr("code_gen_dir_ipgen") - pe = self.get_nodeattr("PE") - output_data_type = self.get_nodeattr("outputDataType") # output precision - o_bitwidth = DataType[output_data_type].bitwidth() - num_channels = self.get_nodeattr("NumChannels") # number of channels - - channel_fold = int(num_channels / pe) - - for stage in range(o_bitwidth): - sn = o_bitwidth - stage - 1 - for pe_value in range(pe): - thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( - self.onnx_node.name, - pe_value, - stage, - ) - threshs = np.zeros([channel_fold * (2**stage)], dtype="object") - for ch in range(channel_fold): - for i in range(2**stage): - threshs[(ch << stage) + i] = t_packed[ch * pe + pe_value][ - (i << (o_bitwidth - stage)) + 2**sn - 1 - ] - with open(thresh_file, "w") as f: - for val in threshs: - f.write(val + "\n") - code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] - - # Identify the module name - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ - self.get_verilog_top_module_name() + "_axi_wrapper" - ] - # Set the top module name - AXI wrapper - code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] - - # Identify the module variables - input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision - bias = self.get_nodeattr("activation_bias") # activation bias value - i_bitwidth = DataType[input_data_type].bitwidth() - - code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string - code_gen_dict["$M$"] = [ - str(i_bitwidth) - ] # input/threshold precision - convert bitwidth to string - code_gen_dict["$C$"] = [str(num_channels)] # number of channels - code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value - code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE - - # Is the input datatype signed or unsigned? - # The thresholding core needs to know this when comparing weights to inputs - if self.get_input_datatype().signed(): - code_gen_dict["$SIGNED$"] = [str(1)] - else: - code_gen_dict["$SIGNED$"] = [str(0)] - - if bias >= 0: - o_bits = math.ceil(math.log2(2**o_bitwidth + bias)) - else: - o_bits = 1 + math.ceil( - math.log2(-bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias) - ) - - code_gen_dict["$O_BITS$"] = [str(int(o_bits))] - - rt_weights = self.get_nodeattr("runtime_writeable_weights") - code_gen_dict["$USE_AXILITE$"] = [str(rt_weights)] - - depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") - depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") - deep_pipeline = self.get_nodeattr("deep_pipeline") - code_gen_dict["$DEPTH_TRIGGER_URAM$"] = [str(depth_trigger_uram)] - code_gen_dict["$DEPTH_TRIGGER_BRAM$"] = [str(depth_trigger_bram)] - code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)] - return code_gen_dict - - def get_rtl_file_list(self): - """Thresholding binary search RTL file list""" - return [ - "axilite_if.v", - "thresholding.sv", - "thresholding_axi.sv", - "thresholding_template_wrapper.v", - ] - - def get_rtl_file_paths(self): - """Get full path of all RTL files""" - rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" - rtl_file_list = self.get_rtl_file_list() - rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] - return rtl_file_paths - - def get_rtl_template_data(self, path): - """Return RTL file contents as a template""" - with open(path, "r") as f: - template = f.read() - return template - - def fill_in_rtl_template_data(self, replace_dict, template_data): - """Use attribute values to finn in RTL template placeholders""" - template_data_cp = template_data - for key in replace_dict: - replacement_line = "\n".join(replace_dict[key]) - template_data_cp = template_data_cp.replace(key, replacement_line) - return template_data_cp - - def dump_rtl_data(self, dest_dir, filename, data): - """Dump filled-in-template RTL files for future synthesis step""" - # when generating template files, handle a special case: - # if the filename contains the word "template", replace that - # with the node name to distinguish between instances - filename = filename.replace("template", self.onnx_node.name) - with open(os.path.join(dest_dir, filename), "w") as f: - f.write(data) - return - - def generate_hdl(self, model): - """Prepare HDL files from templates for synthesis""" - # Generate a dictionary of values to put in RTL template - code_gen_dict = self.prepare_codegen_rtl_values(model) - - # Retrieve the destination directory for the final RTL files - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - - for rtl_file_path in self.get_rtl_file_paths(): - # read in original RTL template file - template_data = self.get_rtl_template_data(rtl_file_path) - # apply code generation to templates - data = self.fill_in_rtl_template_data(code_gen_dict, template_data) - # dump filled-in template to destination directory for compilation - file_only_path = rtl_file_path.split("/")[-1] - self.dump_rtl_data(code_gen_dir, file_only_path, data) - - # Before we return - set the 'gen_top_module' attribute for use later - # by PyVerilator and IPI generation - self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) - return - - def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl(model) - - # set ipgen_path and ip_path so that HLS-Synth transformation - # and stich_ip transformation do not complain - # i.e. during the HLSSynthIP() transformation - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) - return - - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] - verilog_files = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] - dat_files = self.get_all_meminit_filenames(abspath=True) - single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") - for dat_file in dat_files: - shutil.copy(dat_file, single_src_dir) - - # build the Verilator emulation library - sim = PyVerilator.build( - verilog_files, - build_dir=single_src_dir, - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_nodeattr("gen_top_module"), - auto_eval=False, - ) - - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - - def execute_node(self, context, graph): - # Perform input checks - if self.get_nodeattr("exec_mode") != "rtlsim": - raise Exception( - "Invalid exec_mode value: {}; exec_mode must be set to '{}'".format( - self.get_nodeattr("exec_mode"), "rtlsim" - ) - ) - - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for Thresholding_Binary_Search") - in_ind += 1 - - # Create a PyVerilator wrapper of the RTLSim .so - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - io_names = self.get_verilog_top_module_intf_names() - istream_name = io_names["s_axis"][0][0] - ostream_name = io_names["m_axis"][0][0] - io_dict = { - "inputs": {istream_name: inp}, - "outputs": {ostream_name: []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"][ostream_name] - - # Manage output data - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - return - - def hls_sname(self): - """Get the naming convention used by Vitis HLS for stream signals - Example: the TDATA for a stream called "out" would be out_V_TDATA. - """ - # no additional prefix/suffix in interface names since this is an RTL component - return "" - - def rtlsim_multi_io(self, sim, io_dict): - "Run rtlsim for this node, supports multiple i/o streams." - - rtlsim_so = self.get_nodeattr("rtlsim_so") - so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) - olcwd = os.getcwd() - os.chdir(so_dir) - - # signal name prefix - # TODO if the interface names on this component get standardized, - # it won't need its own rtlsim_multi_io variant anymore and can just - # use the base class one - sname = "_" - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" - num_out_values = self.get_number_output_values() - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - do_reset=True, - liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - ) - self.set_nodeattr("cycles_rtlsim", total_cycle_count) - os.chdir(olcwd) - - def code_generation_ipi(self): - """Constructs and returns the TCL commands for node instantiation as an RTL - block.""" - rtl_file_list = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name - cmd = ["file mkdir %s" % source_target] - - for rtl_file in rtl_file_list: - cmd.append( - "add_files -copy_to %s -norecurse %s" - % (source_target, os.path.join(code_gen_dir, rtl_file)) - ) - - # Create an RTL block, not an IP core (-type ip) - cmd.append( - "create_bd_cell -type module -reference %s %s" - % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) - ) - - return cmd - - def get_verilog_top_module_intf_names(self): - """Return a dict of names of input and output interfaces. - The keys reflect the protocols each interface implements: - 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. - Values are lists of tuples (axis, aximm) or names (axilite): - 'axis' tuples correspond to the list of node inputs in order, - each tuple is (interface_name, interface_width_bits). - axilite always assumed to be 32 bits and is not tuple (name only). - Each block must have at most one aximm and one axilite.""" - - intf_names = {} - intf_names["clk"] = ["ap_clk"] - intf_names["rst"] = ["ap_rst_n"] - intf_names["s_axis"] = [("in0_V", self.get_instream_width_padded())] - intf_names["m_axis"] = [("out_V", self.get_outstream_width_padded())] - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] - if self.get_nodeattr("runtime_writeable_weights") == 1: - intf_names["axilite"] = ["s_axilite"] - - return intf_names - - def get_dynamic_config(self, model, address_stride=1): - """Returns a configuration dictionary containing axilite write commands - in order to program the thresholds into the RTL core during runtime. - The default address stride for the weights is 1 byte.""" - - thresholds = model.get_initializer(self.onnx_node.input[1]) - num_channels, num_weights_per_channel = thresholds.shape - - weight_addr_boundary = find_next_power_of_2(num_weights_per_channel) - # Make sure that the next power of 2 (output) is greater than the input - assert weight_addr_boundary >= num_weights_per_channel - - config = {} - channel_cntr = 0 - wdt = self.get_weight_datatype() - bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) - for channel in thresholds: - channel_start_addr = channel_cntr * weight_addr_boundary * address_stride - weight_cntr = 0 - addr = 0 - for weight in channel: - key_name = "{}_{}{}_{}{}".format( - "axilite", "ch", str(channel_cntr), "w", str(weight_cntr) - ) - config[key_name] = ( - channel_start_addr + addr, - int( - str( - pack_innermost_dim_as_hex_string( - [weight], - wdt, - bw_hexdigit, - ) - ), - 0, - ), - ) - - weight_cntr += 1 - addr += address_stride - - channel_cntr += 1 - - return config - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - """This is needed for the HLSSynthIP() transformation. - This is an IP, not a HLS node, so therefore provide an empty hook - to prevent any HLS synthesis.""" - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass From ac1478dac5774ec5d4e599213e37b19ca0ab8967 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:39:46 +0000 Subject: [PATCH 063/291] [TBS] Clean up branch for HLS variant only --- finn-rtllib/thresholding/component.xml | 1002 ----------------- .../gui/thresholding_axi_v1_0.gtcl | 4 - finn-rtllib/thresholding/hdl/axilite_if.v | 210 ---- finn-rtllib/thresholding/hdl/thresholding.sv | 357 ------ .../thresholding/hdl/thresholding_axi.sv | 164 --- .../hdl/thresholding_template_wrapper.v | 120 -- finn-rtllib/thresholding/sim/thresh_gen.sv | 45 - finn-rtllib/thresholding/sim/thresholding.tcl | 17 - .../thresholding/sim/thresholding_axi_tb.sv | 314 ------ .../thresholding/sim/thresholding_tb.sv | 274 ----- .../xgui/thresholding_axi_v1_0.tcl | 187 --- src/finn/util/basic.py | 70 -- tests/util/test_basic.py | 60 - 13 files changed, 2824 deletions(-) delete mode 100644 finn-rtllib/thresholding/component.xml delete mode 100644 finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl delete mode 100644 finn-rtllib/thresholding/hdl/axilite_if.v delete mode 100644 finn-rtllib/thresholding/hdl/thresholding.sv delete mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi.sv delete mode 100644 finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v delete mode 100644 finn-rtllib/thresholding/sim/thresh_gen.sv delete mode 100644 finn-rtllib/thresholding/sim/thresholding.tcl delete mode 100644 finn-rtllib/thresholding/sim/thresholding_axi_tb.sv delete mode 100644 finn-rtllib/thresholding/sim/thresholding_tb.sv delete mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl delete mode 100755 tests/util/test_basic.py diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml deleted file mode 100644 index e28a3a2c2d..0000000000 --- a/finn-rtllib/thresholding/component.xml +++ /dev/null @@ -1,1002 +0,0 @@ - - - amd.com - finn - thresholding_axi - 1.0 - - - ap_clk - - - - - - - CLK - - - ap_clk - - - - - - ASSOCIATED_RESET - ap_rst_n - - - ASSOCIATED_BUSIF - s_axilite:s_axis:m_axis - - - FREQ_TOLERANCE_HZ - -1 - - - - - m_axis - - - - - - - TDATA - - - m_axis_tdata - - - - - TVALID - - - m_axis_tvalid - - - - - TREADY - - - m_axis_tready - - - - - - s_axis - - - - - - - TDATA - - - s_axis_tdata - - - - - TVALID - - - s_axis_tvalid - - - - - TREADY - - - s_axis_tready - - - - - - s_axilite - - - - - - - - - AWADDR - - - s_axilite_AWADDR - - - - - AWVALID - - - s_axilite_AWVALID - - - - - AWREADY - - - s_axilite_AWREADY - - - - - WDATA - - - s_axilite_WDATA - - - - - WSTRB - - - s_axilite_WSTRB - - - - - WVALID - - - s_axilite_WVALID - - - - - WREADY - - - s_axilite_WREADY - - - - - BRESP - - - s_axilite_BRESP - - - - - BVALID - - - s_axilite_BVALID - - - - - BREADY - - - s_axilite_BREADY - - - - - ARADDR - - - s_axilite_ARADDR - - - - - ARVALID - - - s_axilite_ARVALID - - - - - ARREADY - - - s_axilite_ARREADY - - - - - RDATA - - - s_axilite_RDATA - - - - - RRESP - - - s_axilite_RRESP - - - - - RVALID - - - s_axilite_RVALID - - - - - RREADY - - - s_axilite_RREADY - - - - - - ap_rst_n - - - - - - - RST - - - ap_rst_n - - - - - - POLARITY - ACTIVE_LOW - - - - - - - s_axilite - s_axilite - - reg0 - reg0 - 0x0 - 4096 - 32 - register - - - - - - - xilinx_anylanguagesynthesis - Synthesis - :vivado.xilinx.com:synthesis - Verilog - thresholding_axi_wrapper - - xilinx_anylanguagesynthesis_view_fileset - - - - viewChecksum - fd0bd85b - - - - - xilinx_anylanguagebehavioralsimulation - Simulation - :vivado.xilinx.com:simulation - Verilog - thresholding_axi_wrapper - - xilinx_anylanguagebehavioralsimulation_view_fileset - - - - viewChecksum - fd0bd85b - - - - - xilinx_xpgui - UI Layout - :vivado.xilinx.com:xgui.ui - - xilinx_xpgui_view_fileset - - - - viewChecksum - fc6b9b63 - - - - - xilinx_utilityxitfiles - Utility XIT/TTCL - :vivado.xilinx.com:xit.util - - xilinx_utilityxitfiles_view_fileset - - - - viewChecksum - 8b0215cd - - - - - - - ap_clk - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - ap_rst_n - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_AWVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_AWREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_AWADDR - - in - - 5 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_WVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_WREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_WDATA - - in - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_WSTRB - - in - - 3 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - s_axilite_BVALID - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_BREADY - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_BRESP - - out - - 1 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_ARVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_ARREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_ARADDR - - in - - 5 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_RVALID - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_RREADY - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_RDATA - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_RRESP - - out - - 1 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_tready - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_tvalid - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_tdata - - in - - 15 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - m_axis_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_tdata - - out - - 7 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - - - N - N - 4 - - - K - K - 16 - - - C - C - 1 - - - PE - Pe - 1 - - - SIGNED - Signed - true - - - FPARG - Fparg - false - - - BIAS - Bias - 0 - - - CF - Cf - 1 - - - ADDR_BITS - Addr Bits - 6 - - - O_BITS - O Bits - 4 - - - - - - choice_list_9d8b0d81 - ACTIVE_HIGH - ACTIVE_LOW - - - - - xilinx_anylanguagesynthesis_view_fileset - - hdl/thresholding.sv - systemVerilogSource - - - hdl/thresholding_axi.sv - systemVerilogSource - - - hdl/thresholding_axi_wrapper.v - verilogSource - CHECKSUM_7b8c102d - - - hdl/axilite_if.v - verilogSource - CHECKSUM_69d1ba26 - xil_defaultlib - - - - xilinx_anylanguagebehavioralsimulation_view_fileset - - hdl/thresholding.sv - systemVerilogSource - - - hdl/thresholding_axi.sv - systemVerilogSource - - - hdl/thresholding_axi_wrapper.v - verilogSource - - - hdl/axilite_if.v - verilogSource - USED_IN_ipstatic - xil_defaultlib - - - - xilinx_xpgui_view_fileset - - xgui/thresholding_axi_v1_0.tcl - tclSource - CHECKSUM_fc6b9b63 - XGUI_VERSION_2 - - - - xilinx_utilityxitfiles_view_fileset - - gui/thresholding_axi_v1_0.gtcl - GTCL - - - - MultiThreshold - - - N - Output Precision - 4 - - - K - Input Precision - 16 - - - C - Channels - 1 - - - PE - Pe - 1 - - - SIGNED - Signed Inputs - true - - - FPARG - Floating-Point Inputs - false - - - BIAS - Bias - 0 - - - CF - Channel Fold - 1 - - - - false - - - - - - ADDR_BITS - Address Bits - 6 - - - - false - - - - - - O_BITS - Output Value Width - 4 - - - - false - - - - - - Component_Name - thresholding_axi_wrapper_v1_0 - - - - - - virtex7 - qvirtex7 - versal - kintex7 - kintex7l - qkintex7 - qkintex7l - akintex7 - artix7 - artix7l - aartix7 - qartix7 - zynq - qzynq - azynq - spartan7 - aspartan7 - virtexu - zynquplus - virtexuplus - virtexuplusHBM - virtexuplus58g - kintexuplus - artixuplus - kintexu - - - /UserIP - - thresholding_axi - level_1 - package_project - 2 - - user.org:user:thresholding_axi_wrapper:1.0 - - 2023-06-27T05:47:20Z - - - - - - 2022.2 - - - - - - - - - - - - - - diff --git a/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl b/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl deleted file mode 100644 index 90d73ede7e..0000000000 --- a/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl +++ /dev/null @@ -1,4 +0,0 @@ -# This file is automatically written. Do not modify. -proc gen_USERPARAMETER_CF_VALUE {C PE } {expr $C/$PE} -proc gen_USERPARAMETER_ADDR_BITS_VALUE {C PE N } {expr int(ceil(log($C/$PE)/log(2))+ceil(log($PE)/log(2))+$N+2)} -proc gen_USERPARAMETER_O_BITS_VALUE {BIAS N } {expr int(ceil($BIAS >= 0? log(pow(2,$N)+$BIAS)/log(2) : 1+log(-$BIAS >= pow(2,$N-1)? -$BIAS : pow(2,$N)+$BIAS)/log(2)))} diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v deleted file mode 100644 index bdd4de288e..0000000000 --- a/finn-rtllib/thresholding/hdl/axilite_if.v +++ /dev/null @@ -1,210 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -module axi4lite_if -#( - parameter ADDR_WIDTH = 32, - parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64 - parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH -) -( -//system signals -input aclk, -input aresetn,//active low, asynchronous assertion and synchronous deassertion - -//Write channels -//write address -output reg awready, -input awvalid, -input [ADDR_WIDTH-1:0] awaddr, -input [2:0] awprot, -//write data -output reg wready, -input wvalid, -input [DATA_WIDTH-1:0] wdata, -input [(DATA_WIDTH/8)-1:0] wstrb, -//burst response -input bready, -output reg bvalid, -output reg [1:0] bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error) - -//Read channels -//read address -output reg arready, -input arvalid, -input [ADDR_WIDTH-1:0] araddr, -input [2:0] arprot, -//read data -input rready, -output reg rvalid, -output reg [1:0] rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error) -output reg [DATA_WIDTH-1:0] rdata, - -//IP-side interface -output reg ip_en, -output reg ip_wen, -output reg [ADDR_WIDTH-1:0] ip_addr, -output [IP_DATA_WIDTH-1:0] ip_wdata, -input ip_rack, -input [IP_DATA_WIDTH-1:0] ip_rdata -); - -localparam RESP_OKAY = 2'b00; -localparam RESP_SLVERR = 2'b10; -//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH))) -localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH); - -reg internal_ren; -reg internal_wen; -reg internal_wack; -reg [ADDR_WIDTH-1:0] internal_raddr; -reg [ADDR_WIDTH-1:0] internal_waddr; -reg [DATA_WIDTH-1:0] internal_wdata; -wire [DATA_WIDTH-1:0] internal_rdata; -reg internal_error = 0; - -//check DATA_WIDTH -initial begin - if(DATA_WIDTH != 32 & DATA_WIDTH != 64) begin - $display("AXI4Lite DATA_WIDTH must be 32 or 64"); - $finish; - end -end - -//transaction state machine -localparam STATE_IDLE = 0, - STATE_READ = 1, - STATE_WRITE = 2; - -reg [1:0] state; - -always @(posedge aclk or negedge aresetn) - if(~aresetn) - state <= STATE_IDLE; - else case(state) - STATE_IDLE: - if(awvalid & wvalid) - state <= STATE_WRITE; - else if(arvalid) - state <= STATE_READ; - STATE_READ: - if(rvalid & rready) - state <= STATE_IDLE; - STATE_WRITE: - if(bvalid & bready) - state <= STATE_IDLE; - default: state <= STATE_IDLE; - endcase - -//write-related internal signals -always @(*) begin - internal_waddr = awaddr >> $clog2(DATA_WIDTH/8); - internal_wdata = wdata; - internal_wen = (state == STATE_IDLE) & awvalid & wvalid; -end - -always @(posedge aclk) begin - awready <= internal_wen; - wready <= internal_wen; -end - -//read-related internal signals -always @(*) begin - internal_raddr = araddr >> $clog2(DATA_WIDTH/8); - internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid; -end - -always @(posedge aclk) - arready <= internal_ren; - -wire write_to_last_fold; - -always @(posedge aclk) begin - ip_wen <= write_to_last_fold; - ip_en <= internal_ren | write_to_last_fold; - if(internal_ren | write_to_last_fold) - ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG); - internal_wack <= internal_wen; -end - -genvar i; -reg [(1<> (internal_rfold*DATA_WIDTH); - always @(posedge aclk) - if(internal_ren) - internal_rfold <= internal_raddr[NFOLDS_LOG-1:0]; - for(i=0; i<(1< - * - * @description - * Produces the N-bit count of those among 2^N-1 thresholds that are not - * larger than the corresponding input: - * y = Σ(T_i <= x) - * The result is computed by binary search. The runtime-configurable - * thresholds must be written in ascending order: - * i < j => T_i < T_j - * The design supports channel folding allowing each input to be processed - * with respect to a selectable set of thresholds. The corresponding - * threshold configuration relies on a channel address prefix. Inputs are - * accompanied by a channel selector. - * - * Parameter Layout as seen on AXI-Lite (row by row): - * | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1 - * ---------+--------------------------------+------------------------------------ - * Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x - * Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x - * Chnl #c | ((c/PE)*$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x - * - *****************************************************************************/ -module thresholding #( - int unsigned N, // output precision - int unsigned K, // input/threshold precision - int unsigned C, // number of channels - int unsigned PE, // parallel processing elements - - bit SIGNED = 1, // signed inputs - bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa - int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] - - // Initial Thresholds - parameter THRESHOLDS_PATH = "", - bit USE_CONFIG = 1, - - // Force Use of On-Chip Memory Blocks - int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) - int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM - bit DEEP_PIPELINE = 0, - - localparam int unsigned CF = C/PE, // Channel fold - localparam int unsigned O_BITS = BIAS >= 0? - /* unsigned */ $clog2(2**N+BIAS) : - /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) -)( - // Global Control - input logic clk, - input logic rst, - - // Threshold Configuration - input logic cfg_en, - input logic cfg_we, - input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a, - input logic [K-1:0] cfg_d, - output logic cfg_rack, - output logic [K-1:0] cfg_q, - - // Input Stream - output logic irdy, - input logic ivld, - input logic [PE-1:0][K-1:0] idat, - - // Output Stream - input logic ordy, - output logic ovld, - output logic [PE-1:0][O_BITS-1:0] odat -); - - // Parameter Constraints Checking - initial begin - if(CF*PE != C) begin - $error("Parallelism PE=%0d is not a multiple of channel count C=%0d.", PE, C); - $finish; - end - end - - // Operations within Pipeline - typedef enum logic [1:0] { - NOP = 2'b00, // No operation - TH = 2'b01, // Thresholding - WR = 2'b11, // Write (initialization) - RB = 2'b10, // Readback (validation) - CFG = 2'b1x // Config op (pointer-preserving) - } op_e; - - // Pipeline Link Type - typedef logic [$clog2(CF)+N-1:0] ptr_t; - typedef logic [K -1:0] val_t; - typedef struct packed { - op_e op; - ptr_t ptr; // WR/RB: address; TH: result - val_t val; // WR/RB: threshold value; TH: input value - } pipe_t; - - //----------------------------------------------------------------------- - // Pipeline Feed - // - configuration always takes precedence - // - number of pending thresholding ops capped to N+3 - // across pipeline and output FIFO: pipe:N + A:1 + B:1 + 1 - localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3; - pipe_t pipe[PE][N+1]; - if(1) begin : blkFeed - - // Thresholding Input Guard ensuring Output FIFO is never overrun - logic signed [$clog2(MAX_PENDING):0] GuardSem = MAX_PENDING-1; // MAX_PENDING-1, ..., 0, -1 - uwire th_full = GuardSem[$left(GuardSem)]; - always_ff @(posedge clk) begin - if(rst) GuardSem <= MAX_PENDING-1; - else begin - automatic logic dec = !(USE_CONFIG && cfg_en) && !th_full && ivld; - automatic logic inc = ovld && ordy; - GuardSem <= GuardSem + (inc == dec? 0 : inc? 1 : -1); - end - end - - // PE Configuration Address Decoding - uwire cfg_sel[PE]; - if(PE == 1) assign cfg_sel[0] = 1; - else begin - for(genvar pe = 0; pe < PE; pe++) begin - assign cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_a[N+:$clog2(PE)] == pe); - end - end - - uwire ptr_t iptr; - assign iptr[0+:N] = cfg_a[0+:N]; - if(CF > 1) begin - // Channel Fold Rotation - logic [$clog2(CF)-1:0] CnlCnt = 0; - logic CnlLst = 0; - always_ff @(posedge clk) begin - if(rst) begin - CnlCnt <= 0; - CnlLst <= 0; - end - else if(!(USE_CONFIG && cfg_en) && !th_full && ivld) begin - CnlCnt <= CnlCnt + (CnlLst? 1-CF : 1); - CnlLst <= CnlCnt == CF-2; - end - end - - assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt; - end - - for(genvar pe = 0; pe < PE; pe++) begin - assign pipe[pe][0] = '{ - op: USE_CONFIG && cfg_en? - (!cfg_sel[pe]? NOP : cfg_we? WR : RB) : - (ivld && !th_full? TH : NOP), - ptr: iptr, - val: !(USE_CONFIG && cfg_en)? idat[pe] : cfg_we? cfg_d : 0 - }; - end - - assign irdy = !(USE_CONFIG && cfg_en) && !th_full; - end : blkFeed - - //----------------------------------------------------------------------- - // Free-Running Thresholding Pipeline - for(genvar stage = 0; stage < N; stage++) begin : genStages - - localparam int unsigned SN = N-1-stage; - for(genvar pe = 0; pe < PE; pe++) begin : genPE - uwire pipe_t p = pipe[pe][stage]; - uwire cs = (p.ptr[SN:0] == 2**SN-1); - - // Threshold Memory - val_t Thresh; // Read-out register - if(1) begin : blkThresh - localparam int unsigned DEPTH = CF * 2**stage; - localparam RAM_STYLE = - DEPTH_TRIGGER_URAM && (DEPTH >= DEPTH_TRIGGER_URAM)? "ultra" : - DEPTH_TRIGGER_BRAM && (DEPTH >= DEPTH_TRIGGER_BRAM)? "block" : - // If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless. - DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto"; - - (* RAM_STYLE = RAM_STYLE *) - val_t Threshs[DEPTH]; - if(THRESHOLDS_PATH != "") begin - initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs); - end - - if(USE_CONFIG) begin : genThreshMem - uwire we = (p.op ==? WR) && cs; - if((CF == 1) && (stage == 0)) begin - always @(posedge clk) begin - if(we) Threshs[0] <= p.val; - end - end - else begin - uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; - always @(posedge clk) begin - if(we) Threshs[addr] <= p.val; - end - end - end : genThreshMem - - if((CF == 1) && (stage == 0)) begin - assign Thresh = Threshs[0]; - end - else begin - uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; - always_ff @(posedge clk) begin - Thresh <= Threshs[addr]; - end - end - - end : blkThresh - - // Pipeline State - pipe_t P = '{ op: NOP, default: 'x }; - logic Reval = 0; - always_ff @(posedge clk) begin - if(rst) begin - P <= '{ op: NOP, default: 'x }; - Reval <= 0; - end - else begin - P <= p; - Reval <= (p.op ==? RB) && cs; - end - end - - logic cmp; - if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val); - else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val); - else begin : blkSignedFloat - uwire mag_eq = Thresh[K-2:0] == P.val[K-2:0]; - uwire mag_le = Thresh[K-2:0] <= P.val[K-2:0]; - always_comb begin - unique case({Thresh[K-1], P.val[K-1]}) - 2'b00: cmp = mag_le; - 2'b01: cmp = 0; - 2'b10: cmp = 1; - 2'b11: cmp = !mag_le || mag_eq; - default: cmp = 'x; - endcase - end - end : blkSignedFloat - - // Pipeline State Update - pipe_t pp; - always_comb begin - pp = P; - if(P.op !=? CFG) pp.ptr[SN] = cmp; - if(Reval) pp.val = Thresh; - end - - // Pipeline State Forward (potentially additional register) - pipe_t pf; - if(!DEEP_PIPELINE) assign pf = pp; - else begin - pipe_t Pf = '{ op: NOP, default: 'x }; - always_ff @(posedge clk) begin - if(rst) Pf <= '{ op: NOP, default: 'x }; - else Pf <= pp; - end - assign pf = Pf; - end - - assign pipe[pe][stage+1] = pf; - - end : genPE - end : genStages - - //----------------------------------------------------------------------- - // Configuration Readback - always_comb begin - cfg_rack = 0; - cfg_q = 0; - foreach(pipe[pe]) begin - automatic pipe_t p = pipe[pe][N]; - cfg_rack |= p.op ==? RB; - cfg_q |= p.val; - end - end - - //----------------------------------------------------------------------- - // Stream Output through FIFO - // - Depth of N + Output Reg to allow pipe to drain entirely under backpressure - // - Typically mapped to an SRL shift register - if(1) begin : blkStreamOutput - localparam int unsigned A_DEPTH = MAX_PENDING - 1; - logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH]; - logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1 - uwire avld = !APtr[$left(APtr)]; - - logic [PE-1:0][N-1:0] BDat = 'x; - logic BVld = 0; - - uwire aload = pipe[0][N].op ==? TH; - uwire bload = !BVld || ordy; - - always_ff @(posedge clk) begin - if(aload) begin - assert(APtr < $signed(A_DEPTH-1)) else begin - $error("Overrun after failing stream guard."); - $stop; - end - foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr; - for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1]; - end - end - always_ff @(posedge clk) begin - if(rst) APtr <= '1; - else APtr <= APtr + (aload == (avld && bload)? 0 : aload? 1 : -1); - end - always_ff @(posedge clk) begin - if(rst) begin - BDat <= 'x; - BVld <= 0; - end - else if(bload) begin - BDat <= ADat[APtr]; - BVld <= avld; - end - end - - assign ovld = BVld; - for(genvar pe = 0; pe < PE; pe++) begin - assign odat[pe] = BDat[pe] + BIAS; - end - end : blkStreamOutput - -endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv deleted file mode 100644 index 1f235b9486..0000000000 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ /dev/null @@ -1,164 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief All-AXI interface adapter for thresholding module. - * @author Thomas B. Preußer - * - * @description - * This AXI adapter fits the core thresholding functionality: - * - with AXI stream data interfaces with flow control - * - with implicit round-robin channel rotation as used by FINN, and - * - performs aligned byte address to parameter word address translation. - *****************************************************************************/ - -module thresholding_axi #( - int unsigned N, // output precision - int unsigned K, // input/threshold precision - int unsigned C = 1, // Channels - int unsigned PE = 1, // Processing Parallelism, requires C = k*PE - - bit SIGNED = 1, // signed inputs - bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa - int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] - - // Initial Thresholds - parameter THRESHOLDS_PATH = "", - - bit USE_AXILITE, // Implement AXI-Lite for threshold read/write - - // Force Use of On-Chip Memory Blocks - int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) - int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM - bit DEEP_PIPELINE = 0, - - localparam int unsigned CF = C/PE, // Channel Fold - localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, - localparam int unsigned O_BITS = BIAS >= 0? - /* unsigned */ $clog2(2**N+BIAS) : - /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) -)( - //- Global Control ------------------ - input logic ap_clk, - input logic ap_rst_n, - - //- AXI Lite ------------------------ - // Writing - input logic s_axilite_AWVALID, - output logic s_axilite_AWREADY, - input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored - - input logic s_axilite_WVALID, - output logic s_axilite_WREADY, - input logic [31:0] s_axilite_WDATA, - input logic [ 3:0] s_axilite_WSTRB, - - output logic s_axilite_BVALID, - input logic s_axilite_BREADY, - output logic [1:0] s_axilite_BRESP, - - // Reading - input logic s_axilite_ARVALID, - output logic s_axilite_ARREADY, - input logic [ADDR_BITS-1:0] s_axilite_ARADDR, - - output logic s_axilite_RVALID, - input logic s_axilite_RREADY, - output logic [31:0] s_axilite_RDATA, - output logic [ 1:0] s_axilite_RRESP, - - //- AXI Stream - Input -------------- - output logic s_axis_tready, - input logic s_axis_tvalid, - input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, - - //- AXI Stream - Output ------------- - input logic m_axis_tready, - output logic m_axis_tvalid, - output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata -); - - //----------------------------------------------------------------------- - // AXI-lite Configuration Interface - uwire cfg_en; - uwire cfg_we; - uwire [ADDR_BITS-3:0] cfg_a; - uwire [K -1:0] cfg_d; - uwire cfg_rack; - uwire [K -1:0] cfg_q; - - if(USE_AXILITE) begin - uwire [ADDR_BITS-1:0] cfg_a0; - axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi ( - .aclk(ap_clk), .aresetn(ap_rst_n), - - .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x), - .wready(s_axilite_WREADY), .wvalid(s_axilite_WVALID), .wdata(s_axilite_WDATA), .wstrb(s_axilite_WSTRB), - .bready(s_axilite_BREADY), .bvalid(s_axilite_BVALID), .bresp(s_axilite_BRESP), - - .arready(s_axilite_ARREADY), .arvalid(s_axilite_ARVALID), .araddr(s_axilite_ARADDR), .arprot('x), - .rready(s_axilite_RREADY), .rvalid(s_axilite_RVALID), .rresp(s_axilite_RRESP), .rdata(s_axilite_RDATA), - - .ip_en(cfg_en), .ip_wen(cfg_we), .ip_addr(cfg_a0), .ip_wdata(cfg_d), - .ip_rack(cfg_rack), .ip_rdata(cfg_q) - ); - assign cfg_a = cfg_a0[ADDR_BITS-3:0]; - always_ff @(posedge ap_clk) begin - assert(!ap_rst_n || !cfg_en || (cfg_a0[ADDR_BITS-2+:2] === 3'h0)) else begin - $error("%m: Spurious high address bits."); - $stop; - end - end - end - else begin - assign cfg_en = 0; - assign cfg_we = 'x; - assign cfg_a = 'x; - assign cfg_d = 'x; - end - - //----------------------------------------------------------------------- - // Kernel Implementation - thresholding #( - .N(N), .K(K), .C(C), .PE(PE), - .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), - .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE), - .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), - .DEEP_PIPELINE(DEEP_PIPELINE) - ) impl ( - .clk(ap_clk), .rst(!ap_rst_n), - - .cfg_en, .cfg_we, .cfg_a, .cfg_d, - .cfg_rack, .cfg_q, - - .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata), - .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) - ); - -endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v deleted file mode 100644 index ef76a23cbc..0000000000 --- a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Copyright (c) 2023, Xilinx - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * * Neither the name of FINN nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @author Thomas B. Preußer - * @brief Verilog wrapper for IP packaging. - */ - -module $MODULE_NAME_AXI_WRAPPER$ #( - parameter N = $N$, // output precision - parameter K = $M$, // input/threshold precision - parameter C = $C$, // Channels - parameter PE = $PE$, // Processing Parallelism, requires C = k*PE - - parameter SIGNED = $SIGNED$, // signed inputs - parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa - parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] - - parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data - parameter USE_AXILITE = $USE_AXILITE$, // Implement AXI-Lite for threshold read/write - - // Force Use of On-Chip Memory Blocks - parameter DEPTH_TRIGGER_URAM = $DEPTH_TRIGGER_URAM$, // if non-zero, local mems of this depth or more go into URAM (prio) - parameter DEPTH_TRIGGER_BRAM = $DEPTH_TRIGGER_BRAM$, // if non-zero, local mems of this depth or more go into BRAM - parameter DEEP_PIPELINE = $DEEP_PIPELINE$, // [bit] extra pipeline stages for easier timing closure - - parameter O_BITS = $O_BITS$ -)( - // Global Control - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axilite:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) - (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) - input ap_clk, - (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) - input ap_rst_n, - - //- AXI Lite ------------------------ - // Writing - input s_axilite_AWVALID, - output s_axilite_AWREADY, - input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored - - input s_axilite_WVALID, - output s_axilite_WREADY, - input [31:0] s_axilite_WDATA, - input [ 3:0] s_axilite_WSTRB, - - output s_axilite_BVALID, - input s_axilite_BREADY, - output [1:0] s_axilite_BRESP, - - // Reading - input s_axilite_ARVALID, - output s_axilite_ARREADY, - input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_ARADDR, - - output s_axilite_RVALID, - input s_axilite_RREADY, - output [31:0] s_axilite_RDATA, - output [ 1:0] s_axilite_RRESP, - - //- AXI Stream - Input -------------- - output in0_V_TREADY, - input in0_V_TVALID, - input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, - - //- AXI Stream - Output ------------- - input out_V_TREADY, - output out_V_TVALID, - output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA -); - - thresholding_axi #( - .N(N), .K(K), .C(C), .PE(PE), - .SIGNED(SIGNED), - .FPARG(FPARG), - .BIAS(BIAS), - .THRESHOLDS_PATH(THRESHOLDS_PATH), - .USE_AXILITE(USE_AXILITE), - .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), - .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), - .DEEP_PIPELINE(DEEP_PIPELINE) - ) core ( - .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), - - .s_axilite_AWVALID(s_axilite_AWVALID), .s_axilite_AWREADY(s_axilite_AWREADY), .s_axilite_AWADDR(s_axilite_AWADDR), - .s_axilite_WVALID(s_axilite_WVALID), .s_axilite_WREADY(s_axilite_WREADY), .s_axilite_WDATA(s_axilite_WDATA), .s_axilite_WSTRB(s_axilite_WSTRB), - .s_axilite_BVALID(s_axilite_BVALID), .s_axilite_BREADY(s_axilite_BREADY), .s_axilite_BRESP(s_axilite_BRESP), - - .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR), - .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP), - .s_axis_tready(in0_V_TREADY), .s_axis_tvalid(in0_V_TVALID), .s_axis_tdata(in0_V_TDATA), - .m_axis_tready(out_V_TREADY), .m_axis_tvalid(out_V_TVALID), .m_axis_tdata(out_V_TDATA) - ); - -endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv deleted file mode 100644 index 713723aafa..0000000000 --- a/finn-rtllib/thresholding/sim/thresh_gen.sv +++ /dev/null @@ -1,45 +0,0 @@ -module thresh_gen; - localparam int unsigned K = 9; - localparam int unsigned N = 4; - localparam int unsigned C = 6; - - typedef logic [K-1:0] thresh_t; - localparam thresh_t THRESHOLDS[C][2**N-1] = '{ - '{ 'h00, 'h01, 'h02, 'h03, 'h04, 'h05, 'h06, 'h07, 'h08, 'h09, 'h0a, 'h0b, 'h0c, 'h0d, 'h0e }, - '{ 'h10, 'h11, 'h12, 'h13, 'h14, 'h15, 'h16, 'h17, 'h18, 'h19, 'h1a, 'h1b, 'h1c, 'h1d, 'h1e }, - '{ 'h20, 'h21, 'h22, 'h23, 'h24, 'h25, 'h26, 'h27, 'h28, 'h29, 'h2a, 'h2b, 'h2c, 'h2d, 'h2e }, - '{ 'h30, 'h31, 'h32, 'h33, 'h34, 'h35, 'h36, 'h37, 'h38, 'h39, 'h3a, 'h3b, 'h3c, 'h3d, 'h3e }, - '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e }, - '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e } - }; - localparam THRESHOLDS_PATH = "./"; - - localparam int unsigned PE = 2; - localparam int unsigned CF = C/PE; - - for(genvar stage = 0; stage < N; stage++) begin - localparam int unsigned SN = N-1-stage; - for(genvar pe = 0; pe < PE; pe++) begin - initial begin - automatic string file = $sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); - - automatic thresh_t threshs[CF * 2**stage]; - for(int unsigned c = 0; c < CF; c++) begin - for(int unsigned i = 0; i < 2**stage; i++) begin - threshs[(c << stage) + i] = THRESHOLDS[c*PE + pe][(i<<(N-stage)) + 2**SN-1]; - end - end - - $writememh(file, threshs); - end - end - end - - // Quit after running all initializers - initial begin - #1ns; - $display("Generation done."); - $finish; - end - -endmodule : thresh_gen diff --git a/finn-rtllib/thresholding/sim/thresholding.tcl b/finn-rtllib/thresholding/sim/thresholding.tcl deleted file mode 100644 index 82dc59deb1..0000000000 --- a/finn-rtllib/thresholding/sim/thresholding.tcl +++ /dev/null @@ -1,17 +0,0 @@ -create_project -force thresholding thresholding.vivado -part xcvc1902-vsva2197-2MP-e-S -set_property board_part xilinx.com:vck190:part0:2.2 [current_project] - -read_verilog hdl/axilite_if.v -read_verilog -sv { hdl/thresholding.sv hdl/thresholding_axi.sv } - -set simset [current_fileset -simset] -set_property -name xsim.simulate.log_all_signals -value true -objects $simset -set_property -name xsim.simulate.runtime -value all -objects $simset -add_files -fileset $simset { sim/thresholding_tb.sv sim/thresholding_axi_tb.sv } - -foreach top { thresholding_tb thresholding_axi_tb } { - set_property top $top $simset - - launch_simulation - close_sim -} diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv deleted file mode 100644 index 918f539d15..0000000000 --- a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv +++ /dev/null @@ -1,314 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for thresholding_axi. - * @author Monica Chiosa - * - */ - -module thresholding_axi_tb #( - int unsigned N = 4, // output precision - int unsigned C = 6, // number of channels - int unsigned PE = 2, - real M0 = 7.3, // slope of the uniform thresholding line - real B0 = 3.1, // offset of the uniform thresholding line - bit THROTTLED = 1, - - localparam int unsigned CF = C/PE, // Channel Fold - localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2 -); - - //----------------------------------------------------------------------- - // Design Geometry - - // For each channel = [0,channel): - // M_channel = M0 + CX*channel - // B_channel = B0 + CX*channel - // Input/threshold precision computed according with the maximum posible value - localparam real CX = 1.375; - localparam int unsigned K = $clog2((2**N-1)*(M0+C*CX) + (B0+C*CX)); // unused sign + magnitude - localparam int unsigned C_BITS = C < 2? 1 : $clog2(C); - - localparam int unsigned MST_STRM_WROUNDS = 503; - - typedef int unsigned threshs_t[C][2**N-1]; - function threshs_t init_thresholds(); - automatic threshs_t res; - for(int unsigned c = 0; c < C; c++) begin - automatic real m = M0 + c*CX; - automatic real b = B0 + c*CX; - foreach(res[c][i]) begin - res[c][i] = int'($ceil(m*i + b)); - end - end - return res; - endfunction : init_thresholds - localparam threshs_t THRESHS = init_thresholds(); - - //----------------------------------------------------------------------- - // Clock and Reset Control - logic clk = 0; - always #5ns clk = !clk; - logic rst = 1; - initial begin - #10ns; - @(posedge clk); - rst <= 0; - end - - //----------------------------------------------------------------------- - // DUT - logic s_axilite_AWVALID; - uwire s_axilite_AWREADY; - logic [ADDR_BITS-1:0] s_axilite_AWADDR; // lowest 2 bits (byte selectors) are ignored - logic s_axilite_WVALID; - uwire s_axilite_WREADY; - logic [ 31:0] s_axilite_WDATA; - uwire s_axilite_BVALID; - logic s_axilite_BREADY; - uwire [ 1:0] s_axilite_BRESP; - logic s_axilite_ARVALID; - uwire s_axilite_ARREADY; - logic [ADDR_BITS-1:0] s_axilite_ARADDR; - uwire s_axilite_RVALID; - uwire s_axilite_RREADY = 1; - uwire [ 31:0] s_axilite_RDATA; - uwire [ 1:0] s_axilite_RRESP; - - uwire irdy; - logic ivld; - logic [PE-1:0][K-1:0] idat; - - logic ordy = 0; - uwire ovld; - uwire [PE-1:0][N-1:0] odat; - - thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut ( - .ap_clk(clk), .ap_rst_n(!rst), - - // Configuration - .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR, - .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1), - .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP, - .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR, - .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP, - - // Stream Processing - .s_axis_tready(irdy), .s_axis_tvalid(ivld), .s_axis_tdata(idat), - .m_axis_tready(ordy), .m_axis_tvalid(ovld), .m_axis_tdata(odat) - ); - - //----------------------------------------------------------------------- - // Input Stimuli - typedef logic [PE-1:0][K-1:0] input_t; - typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; - input_t QW[$]; // Input Feed Tracing - addr_t QC[$]; - - int unsigned error_cnt = 0; - bit done = 0; - initial begin - // Report testbench details - $display("Testbench - tresholding K=%0d -> N=%0d", K, N); - for(int unsigned c = 0; c < C; c++) begin - $write("Channel #%0d: Thresholds = {", c); - for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0d", THRESHS[c][i]); - $display(" }"); - end - - // Config - s_axilite_AWVALID = 0; - s_axilite_AWADDR = 'x; - s_axilite_WVALID = 0; - s_axilite_WDATA = 'x; - s_axilite_BREADY = 0; - s_axilite_ARVALID = 0; - s_axilite_ARADDR = 'x; - - // Stream Input - ivld = 0; - idat = 'x; - - @(posedge clk iff !rst); - - // Threshold Configuration - for(int unsigned c = 0; c < C; c+=PE) begin - automatic addr_t addr = 0; - if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = c/PE; - for(int unsigned pe = 0; pe < PE; pe++) begin - if(PE > 1) addr[N+:$clog2(PE)] = pe; - for(int unsigned t = 0; t < 2**N-1; t++) begin - addr[0+:N] = t; - fork - begin - s_axilite_AWVALID <= 1; - s_axilite_AWADDR <= { addr, 2'b00 }; - @(posedge clk iff s_axilite_AWREADY); - s_axilite_AWVALID <= 0; - s_axilite_AWADDR <= 'x; - end - begin - s_axilite_WVALID <= 1; - s_axilite_WDATA <= THRESHS[c+pe][t]; - @(posedge clk iff s_axilite_WREADY); - s_axilite_WVALID <= 0; - s_axilite_WDATA <= 'x; - end - begin - s_axilite_BREADY <= 1; - @(posedge clk iff s_axilite_BVALID); - assert(s_axilite_BRESP == '0) else begin - $error("Error on parameter write."); - $stop; - end - s_axilite_BREADY <= 0; - end - join - end - end - end - - fork - // Intermittent configuration readback - while(!done) begin - if(($urandom()%37) != 0) begin - s_axilite_ARVALID <= 0; - s_axilite_ARADDR <= 'x; - @(posedge clk); - end - else begin - automatic addr_t addr = $urandom()%(N-1); - if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; - if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; - - s_axilite_ARVALID <= 1; - s_axilite_ARADDR <= { addr, 2'b00 }; - @(posedge clk iff s_axilite_ARREADY); - - QC.push_back(addr); - end - end - - // AXI4Stream MST Writes input values - repeat(MST_STRM_WROUNDS) begin - automatic input_t dat; - - while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); - - std::randomize(dat); - ivld <= 1; - idat <= dat; - @(posedge clk iff irdy); - ivld <= 0; - idat <= 'x; - QW.push_back(dat); - end - join_any - done <= 1; - repeat(N+6) @(posedge clk); - - assert(QW.size() == 0) else begin - $error("Missing %0d outputs.", QW.size()); - $stop; - end - assert(QC.size() == 0) else begin - $error("Missing %0d readback replies.", QC.size()); - $stop; - end - - $display("Test completed: %0d errors in %0d tests.", error_cnt, MST_STRM_WROUNDS); - $display("========================================="); - $finish; - end - - // Output Checker ------------------------------------------------------- - - // Configuration Readback - always_ff @(posedge clk iff s_axilite_RVALID) begin - assert(s_axilite_RRESP == '0) else begin - $error("Read back error."); - $stop; - end - assert(QC.size()) begin - automatic addr_t addr = QC.pop_front(); - automatic int unsigned cnl = - (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + - (PE == 1? 0 : addr[N+:$clog2(PE)]); - automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; - assert(s_axilite_RDATA == exp) else begin - $error("Readback mismatch on #%0d.%0d: %0d instead of %0d", cnl, addr[0+:N], s_axilite_RDATA, exp); - $stop; - end - end - else begin - $error("Spurious readback output."); - $stop; - end - end - - // Stream Output - int unsigned OCnl = 0; - always @(posedge clk) begin - if(rst) begin - OCnl <= 0; - ordy <= 1'b0; - end - else begin - if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; - - if(ordy && ovld) begin - assert(QW.size()) begin - automatic input_t x = QW.pop_front(); - - for(int unsigned pe = 0; pe < PE; pe++) begin - automatic int unsigned cnl = OCnl + pe; - - $display("Mapped CNL=%0d DAT=%3d -> #%2d", cnl, x[pe], odat[pe]); - assert( - ((odat[pe] == 0) || (THRESHS[cnl][odat[pe]-1] <= x[pe])) && - ((odat[pe] == 2**N-1) || (x[pe] < THRESHS[cnl][odat[pe]])) - ) else begin - $error("Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", cnl, x[pe], odat[pe]); - error_cnt++; - $stop; - end - end - end - else begin - $error("Spurious output."); - $stop; - end - - OCnl <= (OCnl + PE)%C; - end - end - end - -endmodule: thresholding_axi_tb diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv deleted file mode 100644 index e42145f10e..0000000000 --- a/finn-rtllib/thresholding/sim/thresholding_tb.sv +++ /dev/null @@ -1,274 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, - * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * @brief Testbench for thresholding_axi. - * @author Monica Chiosa - * - */ - -module thresholding_tb #( - int unsigned K = 10, // input precision - int unsigned N = 4, // output precision - int unsigned C = 6, // number of channels - int unsigned PE = 2, - - localparam int unsigned CF = C/PE // Channel Fold -); - localparam bit DEEP_PIPELINE = 1; - - localparam int unsigned MST_STRM_WROUNDS = 507; - localparam bit THROTTLED = 1; - - //----------------------------------------------------------------------- - // Clock and Reset Control - logic clk = 0; - always #5ns clk = !clk; - logic rst = 1; - initial begin - #10ns; - @(posedge clk); - rst <= 0; - end - - //----------------------------------------------------------------------- - // Parallel Instances differing in Data Type - typedef logic [K -1:0] val_t; - typedef val_t threshs_t[C][2**N-1]; - typedef val_t [PE-1:0] input_t; - typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; - logic [0:2] term = '0; - always_comb begin - if(&term) $finish; - end - for(genvar i = 0; i < 3; i++) begin : genTypes - localparam bit SIGNED = i>0; - localparam bit FPARG = i>1; - - //- DUT ------------------------- - logic cfg_en; - logic cfg_we; - logic [$clog2(C)+N-1:0] cfg_a; - logic [K-1:0] cfg_d; - uwire cfg_rack; - uwire [K-1:0] cfg_q; - - uwire irdy; - logic ivld; - logic [PE-1:0][K-1:0] idat; - - logic ordy = 0; - uwire ovld; - uwire [PE-1:0][N-1:0] odat; - - thresholding #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .USE_CONFIG(1), .DEEP_PIPELINE(DEEP_PIPELINE)) dut ( - .clk, .rst, - - // Configuration - .cfg_en, .cfg_we, .cfg_a, .cfg_d, - .cfg_rack, .cfg_q, - - // Stream Processing - .irdy, .ivld, .idat, - .ordy, .ovld, .odat - ); - - //- Stimulus Driver ------------- - threshs_t THRESHS; - function val_t sigord(input val_t x); - automatic val_t res = x; - if(SIGNED) begin - if(FPARG && x[K-1]) res[K-2:0] = ~x[K-2:0]; - res[K-1] = !x[K-1]; - end - return res; - endfunction : sigord - - input_t QW[$]; // Input tracing - addr_t QC[$]; // Readback tracking - int unsigned error_cnt = 0; - bit done = 0; - initial begin - - // Generate thresholds - std::randomize(THRESHS); - foreach(THRESHS[c]) begin - val_t row[2**N-1] = THRESHS[c]; - row.sort with (sigord(item)); - THRESHS[c] = row; - end - - // Report test case details - $display("[%0d] Thresholding %s%s%0d -> uint%0d", i, SIGNED? "s" : "u", FPARG? "fp" : "int", K, N); - for(int unsigned c = 0; c < C; c++) begin - $write("[%0d] Channel #%0d: Thresholds = {", i, c); - for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0X", THRESHS[c][i]); - $display(" }"); - end - - // Config - cfg_en = 0; - cfg_we = 'x; - cfg_a = 'x; - cfg_d = 'x; - - // Stream Input - ivld = 0; - idat = 'x; - - @(posedge clk iff !rst); - - // Threshold Configuratin - cfg_en <= 1; - cfg_we <= 1; - for(int unsigned c = 0; c < C; c+=PE) begin - if(CF > 1) cfg_a[N+$clog2(PE)+:$clog2(CF)] <= c/PE; - for(int unsigned pe = 0; pe < PE; pe++) begin - if(PE > 1) cfg_a[N+:$clog2(PE)] = pe; - for(int unsigned t = 0; t < 2**N-1; t++) begin - cfg_a[0+:N] <= t; - cfg_d <= THRESHS[c+pe][t]; - @(posedge clk); - end - end - end - cfg_d <= 'x; - - fork - // Intermittent configuration readback - while(!done) begin - cfg_en <= 0; - cfg_we <= 'x; - cfg_a <= 'x; - @(posedge clk); - if(($urandom()%41) == 0) begin - automatic addr_t addr = $urandom()%(N-1); - if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; - if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; - - cfg_en <= 1; - cfg_we <= 0; - cfg_a <= addr; - @(posedge clk); - QC.push_back(addr); - end - end - - // AXI4Stream MST Writes input values - repeat(MST_STRM_WROUNDS) begin - automatic input_t dat; - - while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); - - std::randomize(dat); - ivld <= 1; - idat <= dat; - @(posedge clk iff irdy); - ivld <= 0; - idat <= 'x; - QW.push_back(dat); - end - join_any - done <= 1; - repeat((DEEP_PIPELINE+1)*N+6) @(posedge clk); - - assert(QW.size() == 0) else begin - $error("[%0d] Missing %0d outputs.", i, QW.size()); - $stop; - end - assert(QC.size() == 0) else begin - $error("[%0d] Missing %0d readback replies.", i, QC.size()); - $stop; - end - - $display("[%0d] Test completed: %0d errors in %0d tests.", i, error_cnt, MST_STRM_WROUNDS); - $display("============================================="); - term[i] <= 1; - end - - //- Readback Checker -------------- - always_ff @(posedge clk iff cfg_rack) begin - assert(QC.size()) begin - automatic addr_t addr = QC.pop_front(); - automatic int unsigned cnl = - (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + - (PE == 1? 0 : addr[N+:$clog2(PE)]); - automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; - assert(cfg_q == exp) else begin - $error("[%0d] Readback mismatch on #%0d.%0d: %0d instead of %0d", i, cnl, addr[0+:N], cfg_q, exp); - $stop; - end - end - else begin - $error("[%0d] Spurious readback output.", i); - $stop; - end - end - - // Output Checker - int unsigned OCnl = 0; - always @(posedge clk) begin - if(rst) begin - OCnl <= 0; - ordy <= 1'b0; - end - else begin - if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; - - if(ordy && ovld) begin - assert(QW.size()) begin - automatic input_t x = QW.pop_front(); - - for(int unsigned pe = 0; pe < PE; pe++) begin - automatic int unsigned cnl = OCnl + pe; - - $display("[%0d] Mapped CNL=%0d DAT=%3x -> #%2d", i, cnl, x[pe], odat[pe]); - assert( - ((odat[pe] == 0) || (sigord(THRESHS[cnl][odat[pe]-1]) <= sigord(x[pe]))) && - ((odat[pe] == 2**N-1) || (sigord(x[pe]) < sigord(THRESHS[cnl][odat[pe]]))) - ) else begin - $error("[%0d] Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", i, cnl, x[pe], odat[pe]); - error_cnt++; - $stop; - end - end - end - else begin - $error("[%0d] Spurious output.", i); - $stop; - end - - OCnl <= (OCnl + PE)%C; - end - end - end - - end : genTypes - -endmodule: thresholding_tb diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl deleted file mode 100644 index 338304fa40..0000000000 --- a/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl +++ /dev/null @@ -1,187 +0,0 @@ - -# Loading additional proc with user specified bodies to compute parameter values. -source [file join [file dirname [file dirname [info script]]] gui/thresholding_axi_v1_0.gtcl] - -# Definitional proc to organize widgets for parameters. -proc init_gui { IPINST } { - ipgui::add_param $IPINST -name "Component_Name" - #Adding Page - set Page_0 [ipgui::add_page $IPINST -name "Page 0"] - ipgui::add_param $IPINST -name "ADDR_BITS" -parent ${Page_0} - ipgui::add_param $IPINST -name "BIAS" -parent ${Page_0} - ipgui::add_param $IPINST -name "C" -parent ${Page_0} - ipgui::add_param $IPINST -name "CF" -parent ${Page_0} - ipgui::add_param $IPINST -name "FPARG" -parent ${Page_0} - ipgui::add_param $IPINST -name "K" -parent ${Page_0} - ipgui::add_param $IPINST -name "N" -parent ${Page_0} - ipgui::add_param $IPINST -name "O_BITS" -parent ${Page_0} - set PE [ipgui::add_param $IPINST -name "PE" -parent ${Page_0}] - set_property tooltip {PE Count} ${PE} - ipgui::add_param $IPINST -name "SIGNED" -parent ${Page_0} - - -} - -proc update_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS PARAM_VALUE.C PARAM_VALUE.PE PARAM_VALUE.N } { - # Procedure called to update ADDR_BITS when any of the dependent parameters in the arguments change - - set ADDR_BITS ${PARAM_VALUE.ADDR_BITS} - set C ${PARAM_VALUE.C} - set PE ${PARAM_VALUE.PE} - set N ${PARAM_VALUE.N} - set values(C) [get_property value $C] - set values(PE) [get_property value $PE] - set values(N) [get_property value $N] - set_property value [gen_USERPARAMETER_ADDR_BITS_VALUE $values(C) $values(PE) $values(N)] $ADDR_BITS -} - -proc validate_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS } { - # Procedure called to validate ADDR_BITS - return true -} - -proc update_PARAM_VALUE.CF { PARAM_VALUE.CF PARAM_VALUE.C PARAM_VALUE.PE } { - # Procedure called to update CF when any of the dependent parameters in the arguments change - - set CF ${PARAM_VALUE.CF} - set C ${PARAM_VALUE.C} - set PE ${PARAM_VALUE.PE} - set values(C) [get_property value $C] - set values(PE) [get_property value $PE] - set_property value [gen_USERPARAMETER_CF_VALUE $values(C) $values(PE)] $CF -} - -proc validate_PARAM_VALUE.CF { PARAM_VALUE.CF } { - # Procedure called to validate CF - return true -} - -proc update_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS PARAM_VALUE.BIAS PARAM_VALUE.N } { - # Procedure called to update O_BITS when any of the dependent parameters in the arguments change - - set O_BITS ${PARAM_VALUE.O_BITS} - set BIAS ${PARAM_VALUE.BIAS} - set N ${PARAM_VALUE.N} - set values(BIAS) [get_property value $BIAS] - set values(N) [get_property value $N] - set_property value [gen_USERPARAMETER_O_BITS_VALUE $values(BIAS) $values(N)] $O_BITS -} - -proc validate_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS } { - # Procedure called to validate O_BITS - return true -} - -proc update_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { - # Procedure called to update BIAS when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { - # Procedure called to validate BIAS - return true -} - -proc update_PARAM_VALUE.C { PARAM_VALUE.C } { - # Procedure called to update C when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.C { PARAM_VALUE.C } { - # Procedure called to validate C - return true -} - -proc update_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { - # Procedure called to update FPARG when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { - # Procedure called to validate FPARG - return true -} - -proc update_PARAM_VALUE.K { PARAM_VALUE.K } { - # Procedure called to update K when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.K { PARAM_VALUE.K } { - # Procedure called to validate K - return true -} - -proc update_PARAM_VALUE.N { PARAM_VALUE.N } { - # Procedure called to update N when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.N { PARAM_VALUE.N } { - # Procedure called to validate N - return true -} - -proc update_PARAM_VALUE.PE { PARAM_VALUE.PE } { - # Procedure called to update PE when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.PE { PARAM_VALUE.PE } { - # Procedure called to validate PE - return true -} - -proc update_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { - # Procedure called to update SIGNED when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { - # Procedure called to validate SIGNED - return true -} - - -proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N} -} - -proc update_MODELPARAM_VALUE.K { MODELPARAM_VALUE.K PARAM_VALUE.K } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.K}] ${MODELPARAM_VALUE.K} -} - -proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C} -} - -proc update_MODELPARAM_VALUE.PE { MODELPARAM_VALUE.PE PARAM_VALUE.PE } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.PE}] ${MODELPARAM_VALUE.PE} -} - -proc update_MODELPARAM_VALUE.SIGNED { MODELPARAM_VALUE.SIGNED PARAM_VALUE.SIGNED } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.SIGNED}] ${MODELPARAM_VALUE.SIGNED} -} - -proc update_MODELPARAM_VALUE.FPARG { MODELPARAM_VALUE.FPARG PARAM_VALUE.FPARG } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.FPARG}] ${MODELPARAM_VALUE.FPARG} -} - -proc update_MODELPARAM_VALUE.BIAS { MODELPARAM_VALUE.BIAS PARAM_VALUE.BIAS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.BIAS}] ${MODELPARAM_VALUE.BIAS} -} - -proc update_MODELPARAM_VALUE.CF { MODELPARAM_VALUE.CF PARAM_VALUE.CF } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.CF}] ${MODELPARAM_VALUE.CF} -} - -proc update_MODELPARAM_VALUE.ADDR_BITS { MODELPARAM_VALUE.ADDR_BITS PARAM_VALUE.ADDR_BITS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.ADDR_BITS}] ${MODELPARAM_VALUE.ADDR_BITS} -} - -proc update_MODELPARAM_VALUE.O_BITS { MODELPARAM_VALUE.O_BITS PARAM_VALUE.O_BITS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.O_BITS}] ${MODELPARAM_VALUE.O_BITS} -} diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 0a6c0b39c9..1796738c58 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -30,7 +30,6 @@ import subprocess import sys import tempfile -from qonnx.util.basic import roundup_to_integer_multiple # test boards test_board_map = ["Pynq-Z1", "KV260_SOM", "ZCU104", "U250"] @@ -77,11 +76,6 @@ alveo_default_platform["U280"] = "xilinx_u280_gen3x16_xdma_1_202211_1" alveo_default_platform["U55C"] = "xilinx_u55c_gen3x16_xdma_3_202210_1" -# Create a joint part map, encompassing other boards too -part_map = {**pynq_part_map, **alveo_part_map} -part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S" -part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S" - def get_rtlsim_trace_depth(): """Return the trace depth for rtlsim via PyVerilator. Controllable @@ -234,67 +228,3 @@ def is_exe(fpath): return exe_file return None - - -def find_next_power_of_2(n): - """For any integer 'n', find the next greatest power of 2""" - # Negative values will loop infinitely below - return 0 - if n <= 0: - return 0 - # If '1' is requested, output will be '0' in the loop below, avoid this now. - elif n == 1: - return 2 # i.e. 2**1 - - # decrement 'n' (to handle cases when `n` itself is a power of 2) - n = n - 1 - - # loop until only one bit is left - while n & n - 1: - # unset rightmost bit - n = n & n - 1 - return n << 1 - - -mem_primitives_versal = { - "URAM_72x4096": (72, 4096), - "URAM_36x8192": (36, 8192), - "URAM_18x16384": (18, 16384), - "URAM_9x32768": (9, 32768), - "BRAM18_36x512": (36, 512), - "BRAM18_18x1024": (18, 1024), - "BRAM18_9x2048": (9, 2048), - "LUTRAM": (1, 64), -} - - -def get_memutil_alternatives( - req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True -): - ret = [ - (primitive_name, memutil(req_mem_spec, primitive_spec)) - for (primitive_name, primitive_spec) in mem_primitives.items() - ] - if sort_min_waste: - ret = sorted(ret, key=lambda x: x[1][2]) - return ret - - -def memutil(req_mem_spec, primitive_spec): - """Computes how many instances of a memory primitive are necessary to - implemented a desired memory size, where req_mem_spec is the desired - size and the primitive_spec is the primitve size. The sizes are expressed - as tuples of (mem_width, mem_depth). Returns (primitive_count, efficiency, waste) - where efficiency in range [0,1] indicates how much of the total capacity is - utilized, and waste indicates how many bits of storage are wasted.""" - - req_width, req_depth = req_mem_spec - prim_width, prim_depth = primitive_spec - - match_width = roundup_to_integer_multiple(req_width, prim_width) - match_depth = roundup_to_integer_multiple(req_depth, prim_depth) - count_width = match_width // prim_width - count_depth = match_depth // prim_depth - count = count_depth * count_width - eff = (req_width * req_depth) / (count * prim_width * prim_depth) - waste = (count * prim_width * prim_depth) - (req_width * req_depth) - return (count, eff, waste) diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py deleted file mode 100755 index 97a8c50261..0000000000 --- a/tests/util/test_basic.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import finn.util.basic as basic - - -@pytest.mark.util -def test_next_power_of_2(): - test_vector = [ - {"input": -2, "expected_result": 0}, - {"input": -1, "expected_result": 0}, - {"input": 0, "expected_result": 0}, - {"input": 1, "expected_result": 2}, - {"input": 2, "expected_result": 2}, - {"input": 3, "expected_result": 4}, - {"input": 4, "expected_result": 4}, - {"input": 7, "expected_result": 8}, - {"input": 8, "expected_result": 8}, - {"input": 11, "expected_result": 16}, - {"input": 15, "expected_result": 16}, - {"input": 16, "expected_result": 16}, - {"input": 18, "expected_result": 32}, - {"input": 27, "expected_result": 32}, - {"input": 31, "expected_result": 32}, - {"input": 32, "expected_result": 32}, - {"input": 42, "expected_result": 64}, - {"input": 65, "expected_result": 128}, - ] - - for test_dict in test_vector: - output = basic.find_next_power_of_2(test_dict["input"]) - assert output >= test_dict["input"] - assert output == test_dict["expected_result"] From 01cff8080b95ea777f7ce384e95118cdbf5a901f Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:42:06 +0000 Subject: [PATCH 064/291] [TBS] clean up for HLS variant only --- .../fpgadataflow/convert_to_hls_layers.py | 74 ++++++------------- 1 file changed, 21 insertions(+), 53 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index c43f058fac..ef02453498 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -1019,10 +1019,9 @@ def apply(self, model): class InferThresholdingLayer(Transformation): """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - def __init__(self, mem_mode="const", use_rtl_variant=False): + def __init__(self, mem_mode="const"): super().__init__() self.mem_mode = mem_mode - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -1074,58 +1073,27 @@ def apply(self, model): ) actval = int(actval) assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requires actval < 0" + node.name + ": Signed output requres actval < 0" + ) + # create and insert new Thresholding_Batch node + new_node = helper.make_node( + "Thresholding_Batch", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + # weightDataType can be tightened by MinimizeAccumulatorWidth + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ActVal=actval, + mem_mode=self.mem_mode, + name="Thresholding_Batch_" + node.name, ) - - # Ensure that RTL variant is not inserted for unsupported configuration - is_rtl_variant_compatible = True - - # Perform checks for RTL variant if chosen - if self.use_rtl_variant and is_rtl_variant_compatible: - new_node = helper.make_node( - "Thresholding_Binary_Search", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - activation_bias=actval, - mem_mode=self.mem_mode, - name="Thresholding_Binary_Search_" + node.name, - ) - else: - if self.use_rtl_variant: - warnings.warn( - """%s : RTL Thresholding requested for unsupported - configuration. Falling back to HLS implementation.""" - % node.name - ) - - # create and insert new Thresholding_Batch node - new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, - ) - graph.node.insert(insert_point, new_node) # remove old node graph.node.remove(node) From b7425284067ff9e3c99553ca05b65b0a22fc5166 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:47:00 +0000 Subject: [PATCH 065/291] [TBS] Remove import of thresholding_batch Signed-off-by: aziz bahri --- src/finn/custom_op/fpgadataflow/__init__.py | 2 -- tests/fpgadataflow/test_convert_to_hw_thresholding.py | 4 ---- 2 files changed, 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index bd9c0366e7..7697e8765d 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -51,7 +51,6 @@ from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool -from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch from finn.custom_op.fpgadataflow.thresholding import ( Thresholding, ) @@ -64,7 +63,6 @@ # registered and plug in correctly into the infrastructure custom_op["MatrixVectorActivation"] = MatrixVectorActivation custom_op["StreamingFIFO"] = StreamingFIFO -custom_op["Thresholding_Batch"] = Thresholding_Batch custom_op["Thresholding"] = Thresholding custom_op["VectorVectorActivation"] = VectorVectorActivation custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index e96581dc89..c7495dd1e4 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -39,10 +39,6 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor -from test_fpgadataflow_thresholding_binary_search import ( - make_single_thresholding_binary_search_modelwrapper, -) - import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls from finn.core.rtlsim_exec import rtlsim_exec from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP From b626ae40e0e97d15cc3f453f49d35e6cd97c8bd5 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 26 Jan 2024 16:49:01 +0000 Subject: [PATCH 066/291] [TBS] Remove batch from layer name Signed-off-by: aziz bahri --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 88a9a64cd6..58ff3e7c0c 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -255,7 +255,7 @@ def apply(self, model): numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, + name="Thresholding_" + node.name, ) graph.node.insert(insert_point, new_node) From e5a9ad424196fd762e9a389f7fd7850a5e1a523a Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 29 Jan 2024 11:30:01 +0000 Subject: [PATCH 067/291] [CustomOp] Re-add tlastmarker hls in registry --- src/finn/custom_op/fpgadataflow/hls/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 87611517f1..188f45273c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -47,9 +47,9 @@ ) from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls +from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls -from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls custom_op = dict() @@ -73,4 +73,5 @@ custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls custom_op["Thresholding_hls"] = Thresholding_hls +custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls From cbda16ece4acd7f5fb5aead14a621c2a98dfb47c Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 29 Jan 2024 12:58:15 +0000 Subject: [PATCH 068/291] [CustomOp] Update thresholding new class hierarchy --- .../fpgadataflow/hls/thresholding_hls.py | 152 +----------------- .../custom_op/fpgadataflow/thresholding.py | 140 +++++++++++++--- .../fpgadataflow/specialize_layers.py | 1 + .../test_convert_to_hw_thresholding.py | 37 ++--- .../test_fpgadataflow_thresholding.py | 11 +- 5 files changed, 140 insertions(+), 201 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 0ad198feb5..91a8693761 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,17 +29,15 @@ import numpy as np import os import textwrap -import warnings from math import ceil, log2 -from finn.custom_op.fpgadataflow.thresholding import Thresholding -from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -54,32 +52,16 @@ # the ... here can be any shape (representing groups of vectors) -class Thresholding_hls(Thresholding,HLSBackend): +class Thresholding_hls(Thresholding, HLSBackend): """Class that corresponds to finn-hls Thresholding_Batch function.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - self.variant = "hls" def get_nodeattr_types(self): my_attrs = { - # parallelization; channels thresholded per cycle - "PE": ("i", True, 0), - # number of channels (each may have different thresholds) - "NumChannels": ("i", True, 0), - # number of steps in thresholding function - "numSteps": ("i", True, 1), # string defining memory type "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), # initialization value for the thresholding accumulator "ActVal": ("i", False, 0), # memory mode for the thresholds @@ -97,7 +79,8 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(Thresholding.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def calc_tmem(self): @@ -106,49 +89,6 @@ def calc_tmem(self): pe = self.get_nodeattr("PE") return mh // pe - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype().name), - str(idt.name), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required Threshold_Batch attributes do not exist.""") - - return info_messages - def bram_estimation(self): """Calculates BRAM cost if resource set to BRAM""" style = self.get_nodeattr("ram_style") @@ -180,52 +120,6 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_weight_datatype(self): - """Returns FINN DataType of thresholds, here called weights.""" - return DataType[self.get_nodeattr("weightDataType")] - - def minimize_accumulator_width(self, model): - "Minimize threshold width ('accumulator width' here due to convention)" - thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - self.set_nodeattr("weightDataType", tdt.name) - # Update QONNX DataType of tensor for consistency - model.set_tensor_datatype(self.onnx_node.input[1], tdt) - return DataType[self.get_nodeattr("weightDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - def get_weightstream_width(self): """Returns weight stream width. Used only in decoupled mode.""" if self.get_nodeattr("mem_mode") == "decoupled": @@ -248,36 +142,6 @@ def get_ap_int_max_w(self): weightstream = self.get_weightstream_width() return max([weightstream, temp_value]) - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -655,13 +519,11 @@ def strm_decl(self): def docompute(self): tmpl_args = self.get_template_param_values() - node = self.onnx_node mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "const": self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} + """Thresholding_Batch (in0_{}, out_{}, threshs, numReps);""".format( - node.op_type, tmpl_args["TSrcI"], tmpl_args["TDstI"], self.hls_sname(), diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 004bf1aec0..0297d0143b 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -54,25 +54,6 @@ def get_nodeattr_types(self): # [4] is four vectors (like a FC layer with batch=4) # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), - # name of the top module in verilog template. Used by PyVerilator - # and IPI generation - "gen_top_module": ("s", False, ""), - # bias to be applied to outputs of the node - "activation_bias": ("i", False, 0), - # whether weights (thresholds) will be - # writable through an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. - "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # memory depth triggers for threshold storage - "depth_trigger_uram": ("i", False, 0), - "depth_trigger_bram": ("i", False, 0), - # enable uniform thres optimization - # doesn't actually do anything yet, only - # for resource estimations - "uniform_thres": ("i", False, 0, {0, 1}), - # enable deep pipelining for easier timing closure - # setting to 0 may save some FFs but otherwise leave on - "deep_pipeline": ("i", False, 1, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -81,11 +62,120 @@ def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() return super().make_const_shape_op(oshape) - def verify_node(): - pass - def infer_node_datatype(): - pass - def get_number_output_values(): - pass + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required Threshold_Batch attributes do not exist.""") + + return info_messages + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_weight_datatype(self): + """Returns FINN DataType of thresholds, here called weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("weightDataType", tdt.name) + # Update QONNX DataType of tensor for consistency + model.set_tensor_datatype(self.onnx_node.input[1], tdt) + return DataType[self.get_nodeattr("weightDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + fold = ich // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + def execute_node(self, context, graph): pass diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 31da3756d3..7fda50c965 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -39,6 +39,7 @@ restricted_layers = [] restricted_layers.append("MatrixVectorActivation") restricted_layers.append("VectorVectorActivation") +restricted_layers.append("Thresholding") def _determine_impl_style(node): diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index c7495dd1e4..dffc5c4642 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,23 +30,14 @@ import numpy as np from onnx import TensorProto, helper -from pyverilator.util.axi_utils import axilite_write, reset_rtlsim from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.multithreshold import multithreshold -from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.core.rtlsim_exec import rtlsim_exec -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -83,12 +74,8 @@ def make_single_multithresholding_modelwrapper( ): NumChannels = thresholds.shape[0] - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, num_input_vecs + [NumChannels] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, num_input_vecs + [NumChannels] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) node_inp_list = ["inp", "thresh"] @@ -128,7 +115,7 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) @pytest.mark.parametrize("num_input_channels", [16]) -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize("impl_style", ["hls"]) # TODO: add rtl later @pytest.mark.fpgadataflow @pytest.mark.vivado def test_convert_multithreshold_to_hardware( @@ -147,8 +134,7 @@ def test_convert_multithreshold_to_hardware( # This implies that it expects a negative activation, BIPOLAR does not provide that if activation == DataType["BIPOLAR"]: pytest.skip( - "Only negative activations are supported for " - "RTL Thresholding Binary Search node" + "Only negative activations are supported for " "RTL Thresholding Binary Search node" ) # Other non-input parameters @@ -160,9 +146,7 @@ def test_convert_multithreshold_to_hardware( activation_bias = output_data_type.min() # Generate random thresholds and sort in ascending order - thresholds = generate_random_threshold_values( - input_data_type, num_input_channels, num_steps - ) + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) # provide non-decreasing/ascending thresholds thresholds = sort_thresholds_increasing(thresholds) @@ -180,6 +164,5 @@ def test_convert_multithreshold_to_hardware( model = model.transform(InferThresholdingLayer()) model = model.transform(SpecializeLayers()) model = model.transform(InferShapes()) - - node_variant = getCustomOp(model.graph.node[0]).variant - assert (impl_style == node_variant) \ No newline at end of file + # TODO functional verification + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 2b7bc28a10..ca2651a31c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -51,6 +51,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -65,7 +66,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_i node_inp_list = ["inp", "thresh"] Thresholding_node = helper.make_node( - "Thresholding_Batch", + "Thresholding", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -135,6 +136,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): actval = odt.min() model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -174,9 +176,9 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "Thresholding_Batch_0" in hls_synt_res_est + assert "Thresholding_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + node = model.get_nodes_by_op_type("Thresholding_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -212,6 +214,7 @@ def test_runtime_thresholds_single_layer(): actval = odt.min() model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = model.transform(SpecializeLayers()) op_inst = getCustomOp(model.graph.node[0]) op_inst.set_nodeattr("runtime_writeable_weights", 1) op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") From 8ab3857258d51c5f57ad36329f793d1858e17cb0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 29 Jan 2024 13:30:02 +0000 Subject: [PATCH 069/291] [Tests] Fix runtime thresholding test --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index ca2651a31c..696ac63c75 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -225,6 +225,7 @@ def test_runtime_thresholds_single_layer(): old_weight_stream = list(old_weight_stream) # need to create stitched IP for runtime weight testing model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) From b1452d2cb2fe769e7f49b0ee0a146e8d6222f1c2 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 29 Jan 2024 17:45:56 +0000 Subject: [PATCH 070/291] [CustomOp] Add execution fct for thresh hw layer --- .../custom_op/fpgadataflow/thresholding.py | 20 ++++++++- .../test_fpgadataflow_thresholding.py | 44 ++++++++++++------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 0297d0143b..6b91735119 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,11 +29,14 @@ import numpy as np import warnings from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp class Thresholding(HWCustomOp): + """Abstraction layer for HW implementation of Thresholding.""" + def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -178,4 +181,17 @@ def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) def execute_node(self, context, graph): - pass + node = self.onnx_node + inp_values = context[node.input[0]] + th_val = context[node.input[1]] + + y = multithreshold(np.transpose(inp_values, (0, 3, 1, 2)), th_val) + y = y.transpose(0, 2, 3, 1) + act = DataType[self.get_nodeattr("outputDataType")] + if act == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + context[node.output[0]] = y diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 696ac63c75..43eca7b7c3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -136,6 +136,32 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): actval = odt.min() model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + + # calculate reference output + # multithreshold util fxn wants NCHW input, not NHWC + y = multithreshold(np.transpose(x, (0, 3, 1, 2)), T) + # convert back to NHWC for comparison to hw outputs + y = np.transpose(y, (0, 2, 3, 1)) + if act == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # package input data as dictionary + input_dict = {"inp": x} + + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all() + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": @@ -151,28 +177,12 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: raise Exception("Unknown exec_mode") - # package input data as dictionary - input_dict = {"inp": x} - - # multithreshold util fxn wants NCHW input, not NHWC - y = multithreshold(np.transpose(x, (0, 3, 1, 2)), T) - # convert back to NHWC for comparison to hw outputs - y = np.transpose(y, (0, 2, 3, 1)) - if act == DataType["BIPOLAR"]: - # binary to bipolar - y = 2 * y - 1 - else: - # signed offset - y += act.min() - - oshape = model.get_tensor_shape("outp") - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all() if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) From 53094b250756e0f97708595cffc1a417facc9374 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 30 Jan 2024 09:16:31 +0000 Subject: [PATCH 071/291] [CustomOp] Move node attribute in thresholding --- src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py | 2 -- src/finn/custom_op/fpgadataflow/thresholding.py | 2 ++ .../transformation/fpgadataflow/convert_to_hw_layers.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 91a8693761..5dcff9aa2b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -62,8 +62,6 @@ def get_nodeattr_types(self): my_attrs = { # string defining memory type "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # initialization value for the thresholding accumulator - "ActVal": ("i", False, 0), # memory mode for the thresholds # const -- embedded thresholds, default # decoupled -- streaming thresholds with streamer packaged inside IP diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 6b91735119..1ce059358e 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -57,6 +57,8 @@ def get_nodeattr_types(self): # [4] is four vectors (like a FC layer with batch=4) # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), + # initialization value for the thresholding accumulator + "ActVal": ("i", False, 0), } my_attrs.update(super().get_nodeattr_types()) return my_attrs diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 58ff3e7c0c..d1d61f0ed5 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -180,12 +180,12 @@ def apply(self, model): model = model.transform(InferDataTypes()) return (model, graph_modified) + class InferThresholdingLayer(Transformation): """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - def __init__(self, mem_mode="const"): + def __init__(self): super().__init__() - self.mem_mode = mem_mode def apply(self, model): graph = model.graph @@ -254,7 +254,6 @@ def apply(self, model): outputDataType=odt.name, numInputVectors=list(thl_in_shape[:-1]), ActVal=actval, - mem_mode=self.mem_mode, name="Thresholding_" + node.name, ) @@ -264,6 +263,8 @@ def apply(self, model): graph_modified = True return (model, graph_modified) + + class InferUpsample(Transformation): """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" From 07b4d7fbd916a91f4e43f91e95d1ba98ebe23270 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 30 Jan 2024 09:22:27 +0000 Subject: [PATCH 072/291] [CustomOp] Fix linting in registry --- src/finn/custom_op/fpgadataflow/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 7697e8765d..d4c9904fe1 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -51,9 +51,7 @@ from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool -from finn.custom_op.fpgadataflow.thresholding import ( - Thresholding, -) +from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation From c600189ab13e0bb5cfb0d024322eb70f90636e53 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 30 Jan 2024 11:00:23 +0000 Subject: [PATCH 073/291] [rtllib] Clean up fifo and fmpadding wrapper --- finn-rtllib/fifo/hdl/fifo_template.v | 8 ++++---- finn-rtllib/fmpadding/hdl/fmpadding_template.v | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/finn-rtllib/fifo/hdl/fifo_template.v b/finn-rtllib/fifo/hdl/fifo_template.v index 4c614c83dd..3f14ae991f 100644 --- a/finn-rtllib/fifo/hdl/fifo_template.v +++ b/finn-rtllib/fifo/hdl/fifo_template.v @@ -31,12 +31,12 @@ module $TOP_MODULE_NAME$( //- Global Control ------------------ -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, - output $COUNT_RANGE$ count, output $COUNT_RANGE$ maxcount, @@ -55,7 +55,7 @@ Q_srl #( .depth($DEPTH$), .width($WIDTH$) ) -$TOP_MODULE_NAME$_impl +impl ( .clock(ap_clk), .reset(!ap_rst_n), diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v index 0b0f40f86a..2347d9b394 100644 --- a/finn-rtllib/fmpadding/hdl/fmpadding_template.v +++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v @@ -31,10 +31,11 @@ module $TOP_MODULE_NAME$( //- Global Control ------------------ -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) -input ap_clk, -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) -input ap_rst_n, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) +input ap_rst_n, //- AXI Lite ------------------------ // Writing @@ -86,7 +87,7 @@ fmpadding_axi #( .INIT_YOFF($INIT_YOFF$), .INIT_YEND($INIT_YEND$) ) -$TOP_MODULE_NAME$_impl +impl ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), From 88af9b7f74b3fb04c3b56af6d116672469e86f7f Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 30 Jan 2024 16:40:33 +0000 Subject: [PATCH 074/291] [CustomOps] First clean up over hls code generation functions --- .../fpgadataflow/hls/addstreams_hls.py | 3 - .../fpgadataflow/hls/channelwise_op_hls.py | 3 - .../fpgadataflow/hls/checksum_hls.py | 3 - .../custom_op/fpgadataflow/hls/concat_hls.py | 3 - .../hls/convolutioninputgenerator_hls.py | 37 ------------- .../fpgadataflow/hls/downsampler_hls.py | 37 ------------- .../fpgadataflow/hls/duplicatestreams_hls.py | 25 --------- .../fpgadataflow/hls/fmpadding_hls.py | 37 ------------- .../fpgadataflow/hls/fmpadding_pixel_hls.py | 55 ++++++------------- .../fpgadataflow/hls/globalaccpool_hls.py | 34 ------------ .../custom_op/fpgadataflow/hls/iodma_hls.py | 6 -- .../fpgadataflow/hls/labelselect_hls.py | 12 ---- .../custom_op/fpgadataflow/hls/lookup_hls.py | 3 - .../custom_op/fpgadataflow/hls/pool_hls.py | 12 ---- .../hls/streamingdatawidthconverter_hls.py | 28 ---------- .../fpgadataflow/hls/streamingeltwise_hls.py | 3 - .../fpgadataflow/hls/streamingmaxpool_hls.py | 37 ------------- .../fpgadataflow/hls/thresholding_hls.py | 3 - .../fpgadataflow/hls/tlastmarker_hls.py | 3 - .../fpgadataflow/hls/upsampler_hls.py | 37 ------------- src/finn/custom_op/fpgadataflow/hlsbackend.py | 46 ++++++++++++---- .../custom_op/fpgadataflow/streamingfifo.py | 24 +------- .../fpgadataflow/insert_fifo.py | 8 ++- .../fpgadataflow/set_fifo_depths.py | 2 + .../test_fpgadataflow_checksum.py | 2 + .../fpgadataflow/test_fpgadataflow_concat.py | 1 + 26 files changed, 68 insertions(+), 396 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py index 1a40970b77..4f7b58d8e1 100644 --- a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py @@ -251,9 +251,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py index e7c263c084..a698acfe49 100644 --- a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -408,9 +408,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py index 23818621b9..8a72ca3c6c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py @@ -305,9 +305,6 @@ def dataoutstrm(self): 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' % code_gen_dir, ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """using T = ap_uint;\n void {}(hls::stream &in0_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py index f608b343f6..94e0c3626c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py @@ -265,9 +265,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): n_inputs = self.get_n_inputs() in_streams = [] diff --git a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py index 7223996e8b..5e0dbfd396 100644 --- a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -526,31 +526,6 @@ def defines(self, var): ) ] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -660,9 +635,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): if self.use_parallel_window_output(): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ @@ -679,12 +651,3 @@ def blackboxfunction(self): self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py index d5bd0877a4..ff9a83d091 100644 --- a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -72,31 +72,6 @@ def defines(self, var): batch_size = self.get_nodeattr("numInputVectors") self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -146,9 +121,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -163,15 +135,6 @@ def blackboxfunction(self): ) ] - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py index de0fadb26c..e19149435e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py @@ -196,28 +196,6 @@ def global_includes(self): def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): n_outputs = self.get_num_output_streams() self.code_gen_dict["$STREAMDECLARATIONS$"] = [] @@ -275,9 +253,6 @@ def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): n_outputs = self.get_num_output_streams() inp_streams = [] diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py index 3b0b870e23..d21b672b73 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -100,31 +100,6 @@ def defines(self, var): ) ] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -193,9 +168,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -210,15 +182,6 @@ def blackboxfunction(self): ) ] - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py index e1393dc96e..62942c4f28 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -70,31 +70,17 @@ def defines(self, var): ) ] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -104,8 +90,8 @@ def docompute(self): hls_call = "FMPadding_Pixel_Nonsquare" self.code_gen_dict["$DOCOMPUTE$"] = [ """{} (in0, out);""".format( - hls_call, in_t + SIMD, {}> (in0_{}, out_{});""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() ) ] @@ -125,36 +111,31 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py index 93398b1dc9..8df18e8b8a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -150,28 +150,6 @@ def global_includes(self): def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -223,9 +201,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, @@ -237,12 +212,3 @@ def blackboxfunction(self): self.hls_sname(), ) ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index a0701b8989..bdc313f592 100644 --- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -413,12 +413,6 @@ def execute_node(self, context, graph): def dataoutstrm(self): pass - def read_npy_data(self): - pass - - def save_as_npy(self): - pass - def strm_decl(self): pass diff --git a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py index 701d061987..cce45eb742 100644 --- a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py @@ -236,9 +236,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, @@ -251,12 +248,3 @@ def blackboxfunction(self): self.hls_sname(), ) ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py index 885d3039a4..e51db9a811 100644 --- a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -141,9 +141,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( diff --git a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py index 2baaad01a7..05bb8fbd74 100644 --- a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py @@ -183,9 +183,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_ibits = self.get_instream_width() packed_in_hls_type = "ap_uint<%d>" % packed_ibits @@ -203,15 +200,6 @@ def blackboxfunction(self): ) ] - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py index be096e63c7..7b656a0120 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -71,31 +71,6 @@ def defines(self, var): self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut)) - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -162,9 +137,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): in_packed_bits = self.get_instream_width() in_packed_hls_type = "ap_uint<%d>" % in_packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py index 2aec40f988..8528986d89 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py @@ -306,9 +306,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py index eb3284a343..b742e1f73b 100755 --- a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -95,31 +95,6 @@ def defines(self, var): ) ] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -190,9 +165,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -207,15 +179,6 @@ def blackboxfunction(self): ) ] - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 5dcff9aa2b..fb90365eef 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -575,9 +575,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): if self.get_nodeattr("mem_mode") == "const": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ diff --git a/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py index c2ed06f832..2e908016e7 100644 --- a/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py @@ -188,9 +188,6 @@ def docompute(self): def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = [] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): dyn_iters = self.get_nodeattr("DynIters") diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py index 89a474a5d3..e52081edf2 100644 --- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -74,31 +74,6 @@ def defines(self, var): batch_size = self.get_nodeattr("numInputVectors") self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( @@ -157,9 +132,6 @@ def dataoutstrm(self): ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits @@ -174,15 +146,6 @@ def blackboxfunction(self): ) ] - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 403b992a05..f5fd8a1094 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -363,11 +363,32 @@ def defines(self, var): added.""" pass - @abstractmethod def read_npy_data(self): """Function to generate the commands for reading data from .npy file in c++, - is member function of HLSBackend class but has to be filled by every node.""" - pass + might need to be overwritten depending on custom op.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) @abstractmethod def strm_decl(self): @@ -390,11 +411,9 @@ def dataoutstrm(self): by every node.""" pass - @abstractmethod def save_as_npy(self): - """Function to generate the commands for saving data in .npy file in c++, - is member function of HLSBackend class but has to be filled by every node.""" - pass + """Function to generate the commands for saving data in .npy file in c++""" + self.code_gen_dict["$SAVEASCNPY$"] = [] @abstractmethod def blackboxfunction(self): @@ -403,11 +422,16 @@ def blackboxfunction(self): by every node.""" pass - @abstractmethod def pragmas(self): - """Function to generate the pragma commands in c++, is member function of - HLSBackend class but has to be filled by every node.""" - pass + """Function to generate the pragma commands in c++, + might need to be overwritten depending on custom op.""" + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") def get_ap_int_max_w(self): """Return the maximum width of any ap_int used in this module. Used to set the diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 950574ba0a..b55af929ed 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -45,6 +45,8 @@ def get_nodeattr_types(self): "depth": ("i", True, 0), # folded shape of input/output "folded_shape": ("ints", True, []), + # normal shape of input/output + "normal_shape": ("ints", True, []), # FINN DataTypes for inputs/outputs "dataType": ("s", True, ""), # FPGA resource type for FIFOs when impl_style is vivado @@ -105,27 +107,7 @@ def get_normal_input_shape(self, ind=0): assert depth >= 2, """Depth is too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": warnings.warn("Depth is high, set between 2 and 256 for efficient SRL implementation") - # derive normal shape from folded shape - # StreamingFIFOs are inserted in between fpgadataflow nodes - # the folded shape could be for example (1, nf, pe) - # with nf (neuron folding): mh // pe - # the normal input shape is in this case (1, mh) - # so to achieve this the two inner dimensions are multiplied - # and together with all previous dimensions - # this gives the normal input shape - - folded_shape = self.get_nodeattr("folded_shape") - # extract inner dimension - inner_dim = folded_shape[-1] - # multiply with the next inner dimension - folding_factor = folded_shape[-2] * inner_dim - normal_ishape = [] - # create the normal_ishape - for i in range(len(folded_shape) - 2): - normal_ishape.append(folded_shape[i]) - normal_ishape.append(folding_factor) - - return normal_ishape + return self.get_nodeattr("normal_shape") def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index de555d4101..4efadf0f27 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -127,6 +127,7 @@ def apply(self, model): folded output shape of the first node is not the same as the folded output shape of the second node. A streaming fifo can't be implemented in between these nodes.""" + n_shape = n0.get_normal_output_shape() # check if outFIFOdepths attribute of first node # and inFIFOdepths attribute of consumer node is equal @@ -162,6 +163,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, @@ -188,6 +190,7 @@ def apply(self, model): n0 = getCustomOp(first_node) # determine fifo node attributes fld_shape = n0.get_folded_input_shape(inp_ind) + n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] @@ -196,7 +199,7 @@ def apply(self, model): fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - n0.get_normal_input_shape(), + n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) @@ -213,6 +216,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, @@ -243,6 +247,7 @@ def apply(self, model): out_ind = list(final_node.output).index(graph_out_name) # determine fifo node attributes fld_shape = n0.get_folded_output_shape(out_ind) + n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] @@ -268,6 +273,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 72b5e495a4..11ffc965b6 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -568,6 +568,7 @@ def apply(self, model): cfgs = get_fifo_split_configs(depth, self.max_qsrl_depth, self.max_vivado_depth) if len(cfgs) > 1: fld_shape = n_inst.get_folded_output_shape() + n_shape = n_inst.get_normal_output_shape() dtype = n_inst.get_nodeattr("dataType") ram_style = n_inst.get_nodeattr("ram_style") shape = model.get_tensor_shape(node.input[0]) @@ -593,6 +594,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=dtype, impl_style=impl_style, ram_style=ram_style, diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 5cdd99f1e4..71d4d60c06 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -49,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -175,6 +176,7 @@ def test_fpgadataflow_checksum(): # rtlsim model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index b4d8a04a95..b52b14fca3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -145,6 +145,7 @@ def test_fpgadataflow_concat_stitchedip(): assert model.graph.node[0].op_type == "StreamingConcat_hls" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, clk_ns)) model = model.transform(HLSSynthIP()) From 8416358ffd78c15e745cfa9d57180b8dccf58099 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 30 Jan 2024 17:17:55 +0000 Subject: [PATCH 075/291] [CustomOp] Move stream declaration for hls code into hlsbackend --- .../fpgadataflow/hls/channelwise_op_hls.py | 13 ------------- .../hls/convolutioninputgenerator_hls.py | 13 ------------- .../custom_op/fpgadataflow/hls/downsampler_hls.py | 13 ------------- .../custom_op/fpgadataflow/hls/fmpadding_hls.py | 13 ------------- .../fpgadataflow/hls/fmpadding_pixel_hls.py | 13 ------------- .../custom_op/fpgadataflow/hls/globalaccpool_hls.py | 13 ------------- src/finn/custom_op/fpgadataflow/hls/iodma_hls.py | 3 --- .../custom_op/fpgadataflow/hls/labelselect_hls.py | 13 ------------- src/finn/custom_op/fpgadataflow/hls/lookup_hls.py | 13 ------------- src/finn/custom_op/fpgadataflow/hls/pool_hls.py | 13 ------------- .../fpgadataflow/hls/streamingmaxpool_hls.py | 13 ------------- .../custom_op/fpgadataflow/hls/upsampler_hls.py | 13 ------------- src/finn/custom_op/fpgadataflow/hlsbackend.py | 13 +++++++++++-- 13 files changed, 11 insertions(+), 148 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py index a698acfe49..14efa113dd 100644 --- a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -344,19 +344,6 @@ def read_npy_data(self): ) ) - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): tmpl_args = self.get_template_param_values() # TODO: why put some template parameters into defines and not others? diff --git a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py index 5e0dbfd396..585f152550 100644 --- a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -526,19 +526,6 @@ def defines(self, var): ) ] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): ram_style = self.get_nodeattr("ram_style") map_to_hls_ram_style = { diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py index ff9a83d091..71db77ef6c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -72,19 +72,6 @@ def defines(self, var): batch_size = self.get_nodeattr("numInputVectors") self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" sname = self.hls_sname() diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py index d21b672b73..b7ad5b1120 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -100,19 +100,6 @@ def defines(self, var): ) ] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): in_t = self.get_input_datatype().get_hls_datatype_str() idim_h, idim_w = self.get_nodeattr("ImgDim") diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py index 62942c4f28..8ce9f79a6e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -70,19 +70,6 @@ def defines(self, var): ) ] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): in_t = self.get_input_datatype().get_hls_datatype_str() odim_h, odim_w = self.get_padded_odim() diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py index 8df18e8b8a..657528be7c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -150,19 +150,6 @@ def global_includes(self): def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format( diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index bdc313f592..9644ab2098 100644 --- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -413,9 +413,6 @@ def execute_node(self, context, graph): def dataoutstrm(self): pass - def strm_decl(self): - pass - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() if self.get_nodeattr("direction") == "out": diff --git a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py index cce45eb742..634d9de55a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py @@ -185,19 +185,6 @@ def read_npy_data(self): ) ) - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ """LabelSelect_Batch<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format( diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py index e51db9a811..feeca8719b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -141,19 +141,6 @@ def dataoutstrm(self): ) ] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): mem_mode = self.get_nodeattr("mem_mode") if mem_mode == "const": diff --git a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py index 05bb8fbd74..64c6ec33f8 100644 --- a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py @@ -110,19 +110,6 @@ def read_npy_data(self): ) ) - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): idt = self.get_input_datatype() i_hls_dt = idt.get_hls_datatype_str() diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py index b742e1f73b..61c9ef3a3e 100755 --- a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -95,19 +95,6 @@ def defines(self, var): ) ] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): dtype = self.get_input_datatype() if dtype.bitwidth() == 1: diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py index e52081edf2..f57d3f7237 100644 --- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -74,19 +74,6 @@ def defines(self, var): batch_size = self.get_nodeattr("numInputVectors") self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - def docompute(self): is_2d = self.get_nodeattr("DimMode") == 0 batch = self.get_nodeattr("numInputVectors") diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index f5fd8a1094..0324b66f47 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -390,12 +390,21 @@ def read_npy_data(self): ) ) - @abstractmethod def strm_decl(self): """Function to generate the commands for the stream declaration in c++, is member function of HLSBackend class but has to be filled by every node.""" - pass + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) @abstractmethod def docompute(self): From 3a0da243b022ab021174e61b04834211b998e09d Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 30 Jan 2024 17:44:43 +0000 Subject: [PATCH 076/291] [CustomOp] Move dataout cpp function to hlsbackend --- .../fpgadataflow/hls/addstreams_hls.py | 25 ------------- .../custom_op/fpgadataflow/hls/concat_hls.py | 25 ------------- .../fpgadataflow/hls/downsampler_hls.py | 29 --------------- .../fpgadataflow/hls/fmpadding_hls.py | 29 --------------- .../fpgadataflow/hls/fmpadding_pixel_hls.py | 29 --------------- .../fpgadataflow/hls/globalaccpool_hls.py | 25 ------------- .../custom_op/fpgadataflow/hls/iodma_hls.py | 3 -- .../fpgadataflow/hls/labelselect_hls.py | 25 ------------- .../hls/streamingdatawidthconverter_hls.py | 28 --------------- .../fpgadataflow/hls/streamingeltwise_hls.py | 25 ------------- .../fpgadataflow/hls/streamingmaxpool_hls.py | 28 --------------- .../fpgadataflow/hls/upsampler_hls.py | 29 --------------- src/finn/custom_op/fpgadataflow/hlsbackend.py | 36 +++++++++++++++---- 13 files changed, 30 insertions(+), 306 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py index 4f7b58d8e1..a3f0e043f8 100644 --- a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py @@ -226,31 +226,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py index 94e0c3626c..008fa9cee8 100644 --- a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py @@ -240,31 +240,6 @@ def docompute(self): ) self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): n_inputs = self.get_n_inputs() in_streams = [] diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py index 71db77ef6c..56f472b9c0 100644 --- a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -28,7 +28,6 @@ import numpy as np import os -from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.downsampler import DownSampler from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend @@ -80,34 +79,6 @@ def docompute(self): IFMDim, SIMD,Stride> (in0_{sname}, out_{sname}, numReps);""" ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py index b7ad5b1120..d57699af05 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -28,7 +28,6 @@ import numpy as np import os -from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend @@ -127,34 +126,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py index 8ce9f79a6e..b7ba301fbc 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -29,7 +29,6 @@ import numpy as np import os -from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend @@ -82,34 +81,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py index 657528be7c..9b2a7b25b0 100644 --- a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -163,31 +163,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index 9644ab2098..8d9903f0f5 100644 --- a/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -410,9 +410,6 @@ def pragmas(self): def execute_node(self, context, graph): pass - def dataoutstrm(self): - pass - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() if self.get_nodeattr("direction") == "out": diff --git a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py index 634d9de55a..1e2c0d034a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py @@ -198,31 +198,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py index 7b656a0120..d1f58d3e87 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -109,34 +109,6 @@ def docompute(self): % (op, self.hls_sname(), self.hls_sname()) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): in_packed_bits = self.get_instream_width() in_packed_hls_type = "ap_uint<%d>" % in_packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py index 8528986d89..0d618d832a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py @@ -281,31 +281,6 @@ def docompute(self): ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py index 61c9ef3a3e..69db7b4606 100755 --- a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -124,34 +124,6 @@ def docompute(self): % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py index f57d3f7237..05d26eddb2 100644 --- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -27,7 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour @@ -91,34 +90,6 @@ def docompute(self): % (self.hls_sname(), self.hls_sname()) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] - def blackboxfunction(self): packed_bits = self.get_instream_width() packed_hls_type = "ap_uint<%d>" % packed_bits diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 0324b66f47..846894d85c 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -392,8 +392,8 @@ def read_npy_data(self): def strm_decl(self): """Function to generate the commands for the stream declaration in c++, - is member function of HLSBackend class but has to be filled - by every node.""" + is member function of HLSBackend class but might need to be filled + by node.""" self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> in0_{} ("in0_{}");'.format( @@ -413,12 +413,36 @@ def docompute(self): by every node.""" pass - @abstractmethod def dataoutstrm(self): """Function to generate the commands for reading out data from c++ and convert - into npy format, is member function of HLSBackend class but has to be filled - by every node.""" - pass + into npy format, is member function of HLSBackend class might need to be filled + by node.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] def save_as_npy(self): """Function to generate the commands for saving data in .npy file in c++""" From bf5de4d00d6a1a8cfa9055f33a37d3a86f62f0e7 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jan 2024 19:13:59 +0000 Subject: [PATCH 077/291] Revert "[TBS] Clean up branch for HLS variant only" This reverts commit ac1478dac5774ec5d4e599213e37b19ca0ab8967. --- finn-rtllib/thresholding/component.xml | 1002 +++++++++++++++++ .../gui/thresholding_axi_v1_0.gtcl | 4 + finn-rtllib/thresholding/hdl/axilite_if.v | 210 ++++ finn-rtllib/thresholding/hdl/thresholding.sv | 357 ++++++ .../thresholding/hdl/thresholding_axi.sv | 164 +++ .../hdl/thresholding_template_wrapper.v | 120 ++ finn-rtllib/thresholding/sim/thresh_gen.sv | 45 + finn-rtllib/thresholding/sim/thresholding.tcl | 17 + .../thresholding/sim/thresholding_axi_tb.sv | 314 ++++++ .../thresholding/sim/thresholding_tb.sv | 274 +++++ .../xgui/thresholding_axi_v1_0.tcl | 187 +++ src/finn/util/basic.py | 70 ++ tests/util/test_basic.py | 60 + 13 files changed, 2824 insertions(+) create mode 100644 finn-rtllib/thresholding/component.xml create mode 100644 finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl create mode 100644 finn-rtllib/thresholding/hdl/axilite_if.v create mode 100644 finn-rtllib/thresholding/hdl/thresholding.sv create mode 100644 finn-rtllib/thresholding/hdl/thresholding_axi.sv create mode 100644 finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v create mode 100644 finn-rtllib/thresholding/sim/thresh_gen.sv create mode 100644 finn-rtllib/thresholding/sim/thresholding.tcl create mode 100644 finn-rtllib/thresholding/sim/thresholding_axi_tb.sv create mode 100644 finn-rtllib/thresholding/sim/thresholding_tb.sv create mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl create mode 100755 tests/util/test_basic.py diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml new file mode 100644 index 0000000000..e28a3a2c2d --- /dev/null +++ b/finn-rtllib/thresholding/component.xml @@ -0,0 +1,1002 @@ + + + amd.com + finn + thresholding_axi + 1.0 + + + ap_clk + + + + + + + CLK + + + ap_clk + + + + + + ASSOCIATED_RESET + ap_rst_n + + + ASSOCIATED_BUSIF + s_axilite:s_axis:m_axis + + + FREQ_TOLERANCE_HZ + -1 + + + + + m_axis + + + + + + + TDATA + + + m_axis_tdata + + + + + TVALID + + + m_axis_tvalid + + + + + TREADY + + + m_axis_tready + + + + + + s_axis + + + + + + + TDATA + + + s_axis_tdata + + + + + TVALID + + + s_axis_tvalid + + + + + TREADY + + + s_axis_tready + + + + + + s_axilite + + + + + + + + + AWADDR + + + s_axilite_AWADDR + + + + + AWVALID + + + s_axilite_AWVALID + + + + + AWREADY + + + s_axilite_AWREADY + + + + + WDATA + + + s_axilite_WDATA + + + + + WSTRB + + + s_axilite_WSTRB + + + + + WVALID + + + s_axilite_WVALID + + + + + WREADY + + + s_axilite_WREADY + + + + + BRESP + + + s_axilite_BRESP + + + + + BVALID + + + s_axilite_BVALID + + + + + BREADY + + + s_axilite_BREADY + + + + + ARADDR + + + s_axilite_ARADDR + + + + + ARVALID + + + s_axilite_ARVALID + + + + + ARREADY + + + s_axilite_ARREADY + + + + + RDATA + + + s_axilite_RDATA + + + + + RRESP + + + s_axilite_RRESP + + + + + RVALID + + + s_axilite_RVALID + + + + + RREADY + + + s_axilite_RREADY + + + + + + ap_rst_n + + + + + + + RST + + + ap_rst_n + + + + + + POLARITY + ACTIVE_LOW + + + + + + + s_axilite + s_axilite + + reg0 + reg0 + 0x0 + 4096 + 32 + register + + + + + + + xilinx_anylanguagesynthesis + Synthesis + :vivado.xilinx.com:synthesis + Verilog + thresholding_axi_wrapper + + xilinx_anylanguagesynthesis_view_fileset + + + + viewChecksum + fd0bd85b + + + + + xilinx_anylanguagebehavioralsimulation + Simulation + :vivado.xilinx.com:simulation + Verilog + thresholding_axi_wrapper + + xilinx_anylanguagebehavioralsimulation_view_fileset + + + + viewChecksum + fd0bd85b + + + + + xilinx_xpgui + UI Layout + :vivado.xilinx.com:xgui.ui + + xilinx_xpgui_view_fileset + + + + viewChecksum + fc6b9b63 + + + + + xilinx_utilityxitfiles + Utility XIT/TTCL + :vivado.xilinx.com:xit.util + + xilinx_utilityxitfiles_view_fileset + + + + viewChecksum + 8b0215cd + + + + + + + ap_clk + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + ap_rst_n + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_AWVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_AWREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_AWADDR + + in + + 5 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_WVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_WREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_WDATA + + in + + 31 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_WSTRB + + in + + 3 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 1 + + + + + s_axilite_BVALID + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_BREADY + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_BRESP + + out + + 1 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_ARVALID + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_ARREADY + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_ARADDR + + in + + 5 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_RVALID + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_RREADY + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + s_axilite_RDATA + + out + + 31 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axilite_RRESP + + out + + 1 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_tready + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_tvalid + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + s_axis_tdata + + in + + 15 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 0 + + + + + m_axis_tready + + in + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + 1 + + + + + m_axis_tvalid + + out + + + std_logic + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + m_axis_tdata + + out + + 7 + 0 + + + + std_logic_vector + xilinx_anylanguagesynthesis + xilinx_anylanguagebehavioralsimulation + + + + + + + + N + N + 4 + + + K + K + 16 + + + C + C + 1 + + + PE + Pe + 1 + + + SIGNED + Signed + true + + + FPARG + Fparg + false + + + BIAS + Bias + 0 + + + CF + Cf + 1 + + + ADDR_BITS + Addr Bits + 6 + + + O_BITS + O Bits + 4 + + + + + + choice_list_9d8b0d81 + ACTIVE_HIGH + ACTIVE_LOW + + + + + xilinx_anylanguagesynthesis_view_fileset + + hdl/thresholding.sv + systemVerilogSource + + + hdl/thresholding_axi.sv + systemVerilogSource + + + hdl/thresholding_axi_wrapper.v + verilogSource + CHECKSUM_7b8c102d + + + hdl/axilite_if.v + verilogSource + CHECKSUM_69d1ba26 + xil_defaultlib + + + + xilinx_anylanguagebehavioralsimulation_view_fileset + + hdl/thresholding.sv + systemVerilogSource + + + hdl/thresholding_axi.sv + systemVerilogSource + + + hdl/thresholding_axi_wrapper.v + verilogSource + + + hdl/axilite_if.v + verilogSource + USED_IN_ipstatic + xil_defaultlib + + + + xilinx_xpgui_view_fileset + + xgui/thresholding_axi_v1_0.tcl + tclSource + CHECKSUM_fc6b9b63 + XGUI_VERSION_2 + + + + xilinx_utilityxitfiles_view_fileset + + gui/thresholding_axi_v1_0.gtcl + GTCL + + + + MultiThreshold + + + N + Output Precision + 4 + + + K + Input Precision + 16 + + + C + Channels + 1 + + + PE + Pe + 1 + + + SIGNED + Signed Inputs + true + + + FPARG + Floating-Point Inputs + false + + + BIAS + Bias + 0 + + + CF + Channel Fold + 1 + + + + false + + + + + + ADDR_BITS + Address Bits + 6 + + + + false + + + + + + O_BITS + Output Value Width + 4 + + + + false + + + + + + Component_Name + thresholding_axi_wrapper_v1_0 + + + + + + virtex7 + qvirtex7 + versal + kintex7 + kintex7l + qkintex7 + qkintex7l + akintex7 + artix7 + artix7l + aartix7 + qartix7 + zynq + qzynq + azynq + spartan7 + aspartan7 + virtexu + zynquplus + virtexuplus + virtexuplusHBM + virtexuplus58g + kintexuplus + artixuplus + kintexu + + + /UserIP + + thresholding_axi + level_1 + package_project + 2 + + user.org:user:thresholding_axi_wrapper:1.0 + + 2023-06-27T05:47:20Z + + + + + + 2022.2 + + + + + + + + + + + + + + diff --git a/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl b/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl new file mode 100644 index 0000000000..90d73ede7e --- /dev/null +++ b/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl @@ -0,0 +1,4 @@ +# This file is automatically written. Do not modify. +proc gen_USERPARAMETER_CF_VALUE {C PE } {expr $C/$PE} +proc gen_USERPARAMETER_ADDR_BITS_VALUE {C PE N } {expr int(ceil(log($C/$PE)/log(2))+ceil(log($PE)/log(2))+$N+2)} +proc gen_USERPARAMETER_O_BITS_VALUE {BIAS N } {expr int(ceil($BIAS >= 0? log(pow(2,$N)+$BIAS)/log(2) : 1+log(-$BIAS >= pow(2,$N-1)? -$BIAS : pow(2,$N)+$BIAS)/log(2)))} diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v new file mode 100644 index 0000000000..bdd4de288e --- /dev/null +++ b/finn-rtllib/thresholding/hdl/axilite_if.v @@ -0,0 +1,210 @@ +/* + Copyright (c) 2020, Xilinx + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of FINN nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +module axi4lite_if +#( + parameter ADDR_WIDTH = 32, + parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64 + parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH +) +( +//system signals +input aclk, +input aresetn,//active low, asynchronous assertion and synchronous deassertion + +//Write channels +//write address +output reg awready, +input awvalid, +input [ADDR_WIDTH-1:0] awaddr, +input [2:0] awprot, +//write data +output reg wready, +input wvalid, +input [DATA_WIDTH-1:0] wdata, +input [(DATA_WIDTH/8)-1:0] wstrb, +//burst response +input bready, +output reg bvalid, +output reg [1:0] bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error) + +//Read channels +//read address +output reg arready, +input arvalid, +input [ADDR_WIDTH-1:0] araddr, +input [2:0] arprot, +//read data +input rready, +output reg rvalid, +output reg [1:0] rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error) +output reg [DATA_WIDTH-1:0] rdata, + +//IP-side interface +output reg ip_en, +output reg ip_wen, +output reg [ADDR_WIDTH-1:0] ip_addr, +output [IP_DATA_WIDTH-1:0] ip_wdata, +input ip_rack, +input [IP_DATA_WIDTH-1:0] ip_rdata +); + +localparam RESP_OKAY = 2'b00; +localparam RESP_SLVERR = 2'b10; +//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH))) +localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH); + +reg internal_ren; +reg internal_wen; +reg internal_wack; +reg [ADDR_WIDTH-1:0] internal_raddr; +reg [ADDR_WIDTH-1:0] internal_waddr; +reg [DATA_WIDTH-1:0] internal_wdata; +wire [DATA_WIDTH-1:0] internal_rdata; +reg internal_error = 0; + +//check DATA_WIDTH +initial begin + if(DATA_WIDTH != 32 & DATA_WIDTH != 64) begin + $display("AXI4Lite DATA_WIDTH must be 32 or 64"); + $finish; + end +end + +//transaction state machine +localparam STATE_IDLE = 0, + STATE_READ = 1, + STATE_WRITE = 2; + +reg [1:0] state; + +always @(posedge aclk or negedge aresetn) + if(~aresetn) + state <= STATE_IDLE; + else case(state) + STATE_IDLE: + if(awvalid & wvalid) + state <= STATE_WRITE; + else if(arvalid) + state <= STATE_READ; + STATE_READ: + if(rvalid & rready) + state <= STATE_IDLE; + STATE_WRITE: + if(bvalid & bready) + state <= STATE_IDLE; + default: state <= STATE_IDLE; + endcase + +//write-related internal signals +always @(*) begin + internal_waddr = awaddr >> $clog2(DATA_WIDTH/8); + internal_wdata = wdata; + internal_wen = (state == STATE_IDLE) & awvalid & wvalid; +end + +always @(posedge aclk) begin + awready <= internal_wen; + wready <= internal_wen; +end + +//read-related internal signals +always @(*) begin + internal_raddr = araddr >> $clog2(DATA_WIDTH/8); + internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid; +end + +always @(posedge aclk) + arready <= internal_ren; + +wire write_to_last_fold; + +always @(posedge aclk) begin + ip_wen <= write_to_last_fold; + ip_en <= internal_ren | write_to_last_fold; + if(internal_ren | write_to_last_fold) + ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG); + internal_wack <= internal_wen; +end + +genvar i; +reg [(1<> (internal_rfold*DATA_WIDTH); + always @(posedge aclk) + if(internal_ren) + internal_rfold <= internal_raddr[NFOLDS_LOG-1:0]; + for(i=0; i<(1< + * + * @description + * Produces the N-bit count of those among 2^N-1 thresholds that are not + * larger than the corresponding input: + * y = Σ(T_i <= x) + * The result is computed by binary search. The runtime-configurable + * thresholds must be written in ascending order: + * i < j => T_i < T_j + * The design supports channel folding allowing each input to be processed + * with respect to a selectable set of thresholds. The corresponding + * threshold configuration relies on a channel address prefix. Inputs are + * accompanied by a channel selector. + * + * Parameter Layout as seen on AXI-Lite (row by row): + * | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1 + * ---------+--------------------------------+------------------------------------ + * Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #c | ((c/PE)*$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * + *****************************************************************************/ +module thresholding #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C, // number of channels + int unsigned PE, // parallel processing elements + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + bit USE_CONFIG = 1, + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel fold + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) +)( + // Global Control + input logic clk, + input logic rst, + + // Threshold Configuration + input logic cfg_en, + input logic cfg_we, + input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a, + input logic [K-1:0] cfg_d, + output logic cfg_rack, + output logic [K-1:0] cfg_q, + + // Input Stream + output logic irdy, + input logic ivld, + input logic [PE-1:0][K-1:0] idat, + + // Output Stream + input logic ordy, + output logic ovld, + output logic [PE-1:0][O_BITS-1:0] odat +); + + // Parameter Constraints Checking + initial begin + if(CF*PE != C) begin + $error("Parallelism PE=%0d is not a multiple of channel count C=%0d.", PE, C); + $finish; + end + end + + // Operations within Pipeline + typedef enum logic [1:0] { + NOP = 2'b00, // No operation + TH = 2'b01, // Thresholding + WR = 2'b11, // Write (initialization) + RB = 2'b10, // Readback (validation) + CFG = 2'b1x // Config op (pointer-preserving) + } op_e; + + // Pipeline Link Type + typedef logic [$clog2(CF)+N-1:0] ptr_t; + typedef logic [K -1:0] val_t; + typedef struct packed { + op_e op; + ptr_t ptr; // WR/RB: address; TH: result + val_t val; // WR/RB: threshold value; TH: input value + } pipe_t; + + //----------------------------------------------------------------------- + // Pipeline Feed + // - configuration always takes precedence + // - number of pending thresholding ops capped to N+3 + // across pipeline and output FIFO: pipe:N + A:1 + B:1 + 1 + localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3; + pipe_t pipe[PE][N+1]; + if(1) begin : blkFeed + + // Thresholding Input Guard ensuring Output FIFO is never overrun + logic signed [$clog2(MAX_PENDING):0] GuardSem = MAX_PENDING-1; // MAX_PENDING-1, ..., 0, -1 + uwire th_full = GuardSem[$left(GuardSem)]; + always_ff @(posedge clk) begin + if(rst) GuardSem <= MAX_PENDING-1; + else begin + automatic logic dec = !(USE_CONFIG && cfg_en) && !th_full && ivld; + automatic logic inc = ovld && ordy; + GuardSem <= GuardSem + (inc == dec? 0 : inc? 1 : -1); + end + end + + // PE Configuration Address Decoding + uwire cfg_sel[PE]; + if(PE == 1) assign cfg_sel[0] = 1; + else begin + for(genvar pe = 0; pe < PE; pe++) begin + assign cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_a[N+:$clog2(PE)] == pe); + end + end + + uwire ptr_t iptr; + assign iptr[0+:N] = cfg_a[0+:N]; + if(CF > 1) begin + // Channel Fold Rotation + logic [$clog2(CF)-1:0] CnlCnt = 0; + logic CnlLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + CnlCnt <= 0; + CnlLst <= 0; + end + else if(!(USE_CONFIG && cfg_en) && !th_full && ivld) begin + CnlCnt <= CnlCnt + (CnlLst? 1-CF : 1); + CnlLst <= CnlCnt == CF-2; + end + end + + assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt; + end + + for(genvar pe = 0; pe < PE; pe++) begin + assign pipe[pe][0] = '{ + op: USE_CONFIG && cfg_en? + (!cfg_sel[pe]? NOP : cfg_we? WR : RB) : + (ivld && !th_full? TH : NOP), + ptr: iptr, + val: !(USE_CONFIG && cfg_en)? idat[pe] : cfg_we? cfg_d : 0 + }; + end + + assign irdy = !(USE_CONFIG && cfg_en) && !th_full; + end : blkFeed + + //----------------------------------------------------------------------- + // Free-Running Thresholding Pipeline + for(genvar stage = 0; stage < N; stage++) begin : genStages + + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin : genPE + uwire pipe_t p = pipe[pe][stage]; + uwire cs = (p.ptr[SN:0] == 2**SN-1); + + // Threshold Memory + val_t Thresh; // Read-out register + if(1) begin : blkThresh + localparam int unsigned DEPTH = CF * 2**stage; + localparam RAM_STYLE = + DEPTH_TRIGGER_URAM && (DEPTH >= DEPTH_TRIGGER_URAM)? "ultra" : + DEPTH_TRIGGER_BRAM && (DEPTH >= DEPTH_TRIGGER_BRAM)? "block" : + // If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless. + DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto"; + + (* RAM_STYLE = RAM_STYLE *) + val_t Threshs[DEPTH]; + if(THRESHOLDS_PATH != "") begin + initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs); + end + + if(USE_CONFIG) begin : genThreshMem + uwire we = (p.op ==? WR) && cs; + if((CF == 1) && (stage == 0)) begin + always @(posedge clk) begin + if(we) Threshs[0] <= p.val; + end + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always @(posedge clk) begin + if(we) Threshs[addr] <= p.val; + end + end + end : genThreshMem + + if((CF == 1) && (stage == 0)) begin + assign Thresh = Threshs[0]; + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always_ff @(posedge clk) begin + Thresh <= Threshs[addr]; + end + end + + end : blkThresh + + // Pipeline State + pipe_t P = '{ op: NOP, default: 'x }; + logic Reval = 0; + always_ff @(posedge clk) begin + if(rst) begin + P <= '{ op: NOP, default: 'x }; + Reval <= 0; + end + else begin + P <= p; + Reval <= (p.op ==? RB) && cs; + end + end + + logic cmp; + if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val); + else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val); + else begin : blkSignedFloat + uwire mag_eq = Thresh[K-2:0] == P.val[K-2:0]; + uwire mag_le = Thresh[K-2:0] <= P.val[K-2:0]; + always_comb begin + unique case({Thresh[K-1], P.val[K-1]}) + 2'b00: cmp = mag_le; + 2'b01: cmp = 0; + 2'b10: cmp = 1; + 2'b11: cmp = !mag_le || mag_eq; + default: cmp = 'x; + endcase + end + end : blkSignedFloat + + // Pipeline State Update + pipe_t pp; + always_comb begin + pp = P; + if(P.op !=? CFG) pp.ptr[SN] = cmp; + if(Reval) pp.val = Thresh; + end + + // Pipeline State Forward (potentially additional register) + pipe_t pf; + if(!DEEP_PIPELINE) assign pf = pp; + else begin + pipe_t Pf = '{ op: NOP, default: 'x }; + always_ff @(posedge clk) begin + if(rst) Pf <= '{ op: NOP, default: 'x }; + else Pf <= pp; + end + assign pf = Pf; + end + + assign pipe[pe][stage+1] = pf; + + end : genPE + end : genStages + + //----------------------------------------------------------------------- + // Configuration Readback + always_comb begin + cfg_rack = 0; + cfg_q = 0; + foreach(pipe[pe]) begin + automatic pipe_t p = pipe[pe][N]; + cfg_rack |= p.op ==? RB; + cfg_q |= p.val; + end + end + + //----------------------------------------------------------------------- + // Stream Output through FIFO + // - Depth of N + Output Reg to allow pipe to drain entirely under backpressure + // - Typically mapped to an SRL shift register + if(1) begin : blkStreamOutput + localparam int unsigned A_DEPTH = MAX_PENDING - 1; + logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH]; + logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1 + uwire avld = !APtr[$left(APtr)]; + + logic [PE-1:0][N-1:0] BDat = 'x; + logic BVld = 0; + + uwire aload = pipe[0][N].op ==? TH; + uwire bload = !BVld || ordy; + + always_ff @(posedge clk) begin + if(aload) begin + assert(APtr < $signed(A_DEPTH-1)) else begin + $error("Overrun after failing stream guard."); + $stop; + end + foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr; + for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1]; + end + end + always_ff @(posedge clk) begin + if(rst) APtr <= '1; + else APtr <= APtr + (aload == (avld && bload)? 0 : aload? 1 : -1); + end + always_ff @(posedge clk) begin + if(rst) begin + BDat <= 'x; + BVld <= 0; + end + else if(bload) begin + BDat <= ADat[APtr]; + BVld <= avld; + end + end + + assign ovld = BVld; + for(genvar pe = 0; pe < PE; pe++) begin + assign odat[pe] = BDat[pe] + BIAS; + end + end : blkStreamOutput + +endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv new file mode 100644 index 0000000000..1f235b9486 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -0,0 +1,164 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief All-AXI interface adapter for thresholding module. + * @author Thomas B. Preußer + * + * @description + * This AXI adapter fits the core thresholding functionality: + * - with AXI stream data interfaces with flow control + * - with implicit round-robin channel rotation as used by FINN, and + * - performs aligned byte address to parameter word address translation. + *****************************************************************************/ + +module thresholding_axi #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C = 1, // Channels + int unsigned PE = 1, // Processing Parallelism, requires C = k*PE + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + + bit USE_AXILITE, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS) +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axilite_AWVALID, + output logic s_axilite_AWREADY, + input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input logic s_axilite_WVALID, + output logic s_axilite_WREADY, + input logic [31:0] s_axilite_WDATA, + input logic [ 3:0] s_axilite_WSTRB, + + output logic s_axilite_BVALID, + input logic s_axilite_BREADY, + output logic [1:0] s_axilite_BRESP, + + // Reading + input logic s_axilite_ARVALID, + output logic s_axilite_ARREADY, + input logic [ADDR_BITS-1:0] s_axilite_ARADDR, + + output logic s_axilite_RVALID, + input logic s_axilite_RREADY, + output logic [31:0] s_axilite_RDATA, + output logic [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata +); + + //----------------------------------------------------------------------- + // AXI-lite Configuration Interface + uwire cfg_en; + uwire cfg_we; + uwire [ADDR_BITS-3:0] cfg_a; + uwire [K -1:0] cfg_d; + uwire cfg_rack; + uwire [K -1:0] cfg_q; + + if(USE_AXILITE) begin + uwire [ADDR_BITS-1:0] cfg_a0; + axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi ( + .aclk(ap_clk), .aresetn(ap_rst_n), + + .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x), + .wready(s_axilite_WREADY), .wvalid(s_axilite_WVALID), .wdata(s_axilite_WDATA), .wstrb(s_axilite_WSTRB), + .bready(s_axilite_BREADY), .bvalid(s_axilite_BVALID), .bresp(s_axilite_BRESP), + + .arready(s_axilite_ARREADY), .arvalid(s_axilite_ARVALID), .araddr(s_axilite_ARADDR), .arprot('x), + .rready(s_axilite_RREADY), .rvalid(s_axilite_RVALID), .rresp(s_axilite_RRESP), .rdata(s_axilite_RDATA), + + .ip_en(cfg_en), .ip_wen(cfg_we), .ip_addr(cfg_a0), .ip_wdata(cfg_d), + .ip_rack(cfg_rack), .ip_rdata(cfg_q) + ); + assign cfg_a = cfg_a0[ADDR_BITS-3:0]; + always_ff @(posedge ap_clk) begin + assert(!ap_rst_n || !cfg_en || (cfg_a0[ADDR_BITS-2+:2] === 3'h0)) else begin + $error("%m: Spurious high address bits."); + $stop; + end + end + end + else begin + assign cfg_en = 0; + assign cfg_we = 'x; + assign cfg_a = 'x; + assign cfg_d = 'x; + end + + //----------------------------------------------------------------------- + // Kernel Implementation + thresholding #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) impl ( + .clk(ap_clk), .rst(!ap_rst_n), + + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata), + .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) + ); + +endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v new file mode 100644 index 0000000000..ef76a23cbc --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -0,0 +1,120 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + * @brief Verilog wrapper for IP packaging. + */ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter N = $N$, // output precision + parameter K = $M$, // input/threshold precision + parameter C = $C$, // Channels + parameter PE = $PE$, // Processing Parallelism, requires C = k*PE + + parameter SIGNED = $SIGNED$, // signed inputs + parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data + parameter USE_AXILITE = $USE_AXILITE$, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + parameter DEPTH_TRIGGER_URAM = $DEPTH_TRIGGER_URAM$, // if non-zero, local mems of this depth or more go into URAM (prio) + parameter DEPTH_TRIGGER_BRAM = $DEPTH_TRIGGER_BRAM$, // if non-zero, local mems of this depth or more go into BRAM + parameter DEEP_PIPELINE = $DEEP_PIPELINE$, // [bit] extra pipeline stages for easier timing closure + + parameter O_BITS = $O_BITS$ +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axilite:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output in0_V_TREADY, + input in0_V_TVALID, + input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, + + //- AXI Stream - Output ------------- + input out_V_TREADY, + output out_V_TVALID, + output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA +); + + thresholding_axi #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), + .FPARG(FPARG), + .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), + .USE_AXILITE(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), + .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) core ( + .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), + + .s_axilite_AWVALID(s_axilite_AWVALID), .s_axilite_AWREADY(s_axilite_AWREADY), .s_axilite_AWADDR(s_axilite_AWADDR), + .s_axilite_WVALID(s_axilite_WVALID), .s_axilite_WREADY(s_axilite_WREADY), .s_axilite_WDATA(s_axilite_WDATA), .s_axilite_WSTRB(s_axilite_WSTRB), + .s_axilite_BVALID(s_axilite_BVALID), .s_axilite_BREADY(s_axilite_BREADY), .s_axilite_BRESP(s_axilite_BRESP), + + .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR), + .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP), + .s_axis_tready(in0_V_TREADY), .s_axis_tvalid(in0_V_TVALID), .s_axis_tdata(in0_V_TDATA), + .m_axis_tready(out_V_TREADY), .m_axis_tvalid(out_V_TVALID), .m_axis_tdata(out_V_TDATA) + ); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv new file mode 100644 index 0000000000..713723aafa --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresh_gen.sv @@ -0,0 +1,45 @@ +module thresh_gen; + localparam int unsigned K = 9; + localparam int unsigned N = 4; + localparam int unsigned C = 6; + + typedef logic [K-1:0] thresh_t; + localparam thresh_t THRESHOLDS[C][2**N-1] = '{ + '{ 'h00, 'h01, 'h02, 'h03, 'h04, 'h05, 'h06, 'h07, 'h08, 'h09, 'h0a, 'h0b, 'h0c, 'h0d, 'h0e }, + '{ 'h10, 'h11, 'h12, 'h13, 'h14, 'h15, 'h16, 'h17, 'h18, 'h19, 'h1a, 'h1b, 'h1c, 'h1d, 'h1e }, + '{ 'h20, 'h21, 'h22, 'h23, 'h24, 'h25, 'h26, 'h27, 'h28, 'h29, 'h2a, 'h2b, 'h2c, 'h2d, 'h2e }, + '{ 'h30, 'h31, 'h32, 'h33, 'h34, 'h35, 'h36, 'h37, 'h38, 'h39, 'h3a, 'h3b, 'h3c, 'h3d, 'h3e }, + '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e }, + '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e } + }; + localparam THRESHOLDS_PATH = "./"; + + localparam int unsigned PE = 2; + localparam int unsigned CF = C/PE; + + for(genvar stage = 0; stage < N; stage++) begin + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin + initial begin + automatic string file = $sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); + + automatic thresh_t threshs[CF * 2**stage]; + for(int unsigned c = 0; c < CF; c++) begin + for(int unsigned i = 0; i < 2**stage; i++) begin + threshs[(c << stage) + i] = THRESHOLDS[c*PE + pe][(i<<(N-stage)) + 2**SN-1]; + end + end + + $writememh(file, threshs); + end + end + end + + // Quit after running all initializers + initial begin + #1ns; + $display("Generation done."); + $finish; + end + +endmodule : thresh_gen diff --git a/finn-rtllib/thresholding/sim/thresholding.tcl b/finn-rtllib/thresholding/sim/thresholding.tcl new file mode 100644 index 0000000000..82dc59deb1 --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding.tcl @@ -0,0 +1,17 @@ +create_project -force thresholding thresholding.vivado -part xcvc1902-vsva2197-2MP-e-S +set_property board_part xilinx.com:vck190:part0:2.2 [current_project] + +read_verilog hdl/axilite_if.v +read_verilog -sv { hdl/thresholding.sv hdl/thresholding_axi.sv } + +set simset [current_fileset -simset] +set_property -name xsim.simulate.log_all_signals -value true -objects $simset +set_property -name xsim.simulate.runtime -value all -objects $simset +add_files -fileset $simset { sim/thresholding_tb.sv sim/thresholding_axi_tb.sv } + +foreach top { thresholding_tb thresholding_axi_tb } { + set_property top $top $simset + + launch_simulation + close_sim +} diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv new file mode 100644 index 0000000000..918f539d15 --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -0,0 +1,314 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for thresholding_axi. + * @author Monica Chiosa + * + */ + +module thresholding_axi_tb #( + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + real M0 = 7.3, // slope of the uniform thresholding line + real B0 = 3.1, // offset of the uniform thresholding line + bit THROTTLED = 1, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2 +); + + //----------------------------------------------------------------------- + // Design Geometry + + // For each channel = [0,channel): + // M_channel = M0 + CX*channel + // B_channel = B0 + CX*channel + // Input/threshold precision computed according with the maximum posible value + localparam real CX = 1.375; + localparam int unsigned K = $clog2((2**N-1)*(M0+C*CX) + (B0+C*CX)); // unused sign + magnitude + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C); + + localparam int unsigned MST_STRM_WROUNDS = 503; + + typedef int unsigned threshs_t[C][2**N-1]; + function threshs_t init_thresholds(); + automatic threshs_t res; + for(int unsigned c = 0; c < C; c++) begin + automatic real m = M0 + c*CX; + automatic real b = B0 + c*CX; + foreach(res[c][i]) begin + res[c][i] = int'($ceil(m*i + b)); + end + end + return res; + endfunction : init_thresholds + localparam threshs_t THRESHS = init_thresholds(); + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // DUT + logic s_axilite_AWVALID; + uwire s_axilite_AWREADY; + logic [ADDR_BITS-1:0] s_axilite_AWADDR; // lowest 2 bits (byte selectors) are ignored + logic s_axilite_WVALID; + uwire s_axilite_WREADY; + logic [ 31:0] s_axilite_WDATA; + uwire s_axilite_BVALID; + logic s_axilite_BREADY; + uwire [ 1:0] s_axilite_BRESP; + logic s_axilite_ARVALID; + uwire s_axilite_ARREADY; + logic [ADDR_BITS-1:0] s_axilite_ARADDR; + uwire s_axilite_RVALID; + uwire s_axilite_RREADY = 1; + uwire [ 31:0] s_axilite_RDATA; + uwire [ 1:0] s_axilite_RRESP; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut ( + .ap_clk(clk), .ap_rst_n(!rst), + + // Configuration + .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR, + .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1), + .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP, + .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR, + .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP, + + // Stream Processing + .s_axis_tready(irdy), .s_axis_tvalid(ivld), .s_axis_tdata(idat), + .m_axis_tready(ordy), .m_axis_tvalid(ovld), .m_axis_tdata(odat) + ); + + //----------------------------------------------------------------------- + // Input Stimuli + typedef logic [PE-1:0][K-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + input_t QW[$]; // Input Feed Tracing + addr_t QC[$]; + + int unsigned error_cnt = 0; + bit done = 0; + initial begin + // Report testbench details + $display("Testbench - tresholding K=%0d -> N=%0d", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("Channel #%0d: Thresholds = {", c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0d", THRESHS[c][i]); + $display(" }"); + end + + // Config + s_axilite_AWVALID = 0; + s_axilite_AWADDR = 'x; + s_axilite_WVALID = 0; + s_axilite_WDATA = 'x; + s_axilite_BREADY = 0; + s_axilite_ARVALID = 0; + s_axilite_ARADDR = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuration + for(int unsigned c = 0; c < C; c+=PE) begin + automatic addr_t addr = 0; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = c/PE; + for(int unsigned pe = 0; pe < PE; pe++) begin + if(PE > 1) addr[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + addr[0+:N] = t; + fork + begin + s_axilite_AWVALID <= 1; + s_axilite_AWADDR <= { addr, 2'b00 }; + @(posedge clk iff s_axilite_AWREADY); + s_axilite_AWVALID <= 0; + s_axilite_AWADDR <= 'x; + end + begin + s_axilite_WVALID <= 1; + s_axilite_WDATA <= THRESHS[c+pe][t]; + @(posedge clk iff s_axilite_WREADY); + s_axilite_WVALID <= 0; + s_axilite_WDATA <= 'x; + end + begin + s_axilite_BREADY <= 1; + @(posedge clk iff s_axilite_BVALID); + assert(s_axilite_BRESP == '0) else begin + $error("Error on parameter write."); + $stop; + end + s_axilite_BREADY <= 0; + end + join + end + end + end + + fork + // Intermittent configuration readback + while(!done) begin + if(($urandom()%37) != 0) begin + s_axilite_ARVALID <= 0; + s_axilite_ARADDR <= 'x; + @(posedge clk); + end + else begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + s_axilite_ARVALID <= 1; + s_axilite_ARADDR <= { addr, 2'b00 }; + @(posedge clk iff s_axilite_ARREADY); + + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat(N+6) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("Missing %0d outputs.", QW.size()); + $stop; + end + assert(QC.size() == 0) else begin + $error("Missing %0d readback replies.", QC.size()); + $stop; + end + + $display("Test completed: %0d errors in %0d tests.", error_cnt, MST_STRM_WROUNDS); + $display("========================================="); + $finish; + end + + // Output Checker ------------------------------------------------------- + + // Configuration Readback + always_ff @(posedge clk iff s_axilite_RVALID) begin + assert(s_axilite_RRESP == '0) else begin + $error("Read back error."); + $stop; + end + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 0 : addr[N+:$clog2(PE)]); + automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; + assert(s_axilite_RDATA == exp) else begin + $error("Readback mismatch on #%0d.%0d: %0d instead of %0d", cnl, addr[0+:N], s_axilite_RDATA, exp); + $stop; + end + end + else begin + $error("Spurious readback output."); + $stop; + end + end + + // Stream Output + int unsigned OCnl = 0; + always @(posedge clk) begin + if(rst) begin + OCnl <= 0; + ordy <= 1'b0; + end + else begin + if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; + + if(ordy && ovld) begin + assert(QW.size()) begin + automatic input_t x = QW.pop_front(); + + for(int unsigned pe = 0; pe < PE; pe++) begin + automatic int unsigned cnl = OCnl + pe; + + $display("Mapped CNL=%0d DAT=%3d -> #%2d", cnl, x[pe], odat[pe]); + assert( + ((odat[pe] == 0) || (THRESHS[cnl][odat[pe]-1] <= x[pe])) && + ((odat[pe] == 2**N-1) || (x[pe] < THRESHS[cnl][odat[pe]])) + ) else begin + $error("Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", cnl, x[pe], odat[pe]); + error_cnt++; + $stop; + end + end + end + else begin + $error("Spurious output."); + $stop; + end + + OCnl <= (OCnl + PE)%C; + end + end + end + +endmodule: thresholding_axi_tb diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv new file mode 100644 index 0000000000..e42145f10e --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv @@ -0,0 +1,274 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for thresholding_axi. + * @author Monica Chiosa + * + */ + +module thresholding_tb #( + int unsigned K = 10, // input precision + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + + localparam int unsigned CF = C/PE // Channel Fold +); + localparam bit DEEP_PIPELINE = 1; + + localparam int unsigned MST_STRM_WROUNDS = 507; + localparam bit THROTTLED = 1; + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // Parallel Instances differing in Data Type + typedef logic [K -1:0] val_t; + typedef val_t threshs_t[C][2**N-1]; + typedef val_t [PE-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + logic [0:2] term = '0; + always_comb begin + if(&term) $finish; + end + for(genvar i = 0; i < 3; i++) begin : genTypes + localparam bit SIGNED = i>0; + localparam bit FPARG = i>1; + + //- DUT ------------------------- + logic cfg_en; + logic cfg_we; + logic [$clog2(C)+N-1:0] cfg_a; + logic [K-1:0] cfg_d; + uwire cfg_rack; + uwire [K-1:0] cfg_q; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .USE_CONFIG(1), .DEEP_PIPELINE(DEEP_PIPELINE)) dut ( + .clk, .rst, + + // Configuration + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + // Stream Processing + .irdy, .ivld, .idat, + .ordy, .ovld, .odat + ); + + //- Stimulus Driver ------------- + threshs_t THRESHS; + function val_t sigord(input val_t x); + automatic val_t res = x; + if(SIGNED) begin + if(FPARG && x[K-1]) res[K-2:0] = ~x[K-2:0]; + res[K-1] = !x[K-1]; + end + return res; + endfunction : sigord + + input_t QW[$]; // Input tracing + addr_t QC[$]; // Readback tracking + int unsigned error_cnt = 0; + bit done = 0; + initial begin + + // Generate thresholds + std::randomize(THRESHS); + foreach(THRESHS[c]) begin + val_t row[2**N-1] = THRESHS[c]; + row.sort with (sigord(item)); + THRESHS[c] = row; + end + + // Report test case details + $display("[%0d] Thresholding %s%s%0d -> uint%0d", i, SIGNED? "s" : "u", FPARG? "fp" : "int", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("[%0d] Channel #%0d: Thresholds = {", i, c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0X", THRESHS[c][i]); + $display(" }"); + end + + // Config + cfg_en = 0; + cfg_we = 'x; + cfg_a = 'x; + cfg_d = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuratin + cfg_en <= 1; + cfg_we <= 1; + for(int unsigned c = 0; c < C; c+=PE) begin + if(CF > 1) cfg_a[N+$clog2(PE)+:$clog2(CF)] <= c/PE; + for(int unsigned pe = 0; pe < PE; pe++) begin + if(PE > 1) cfg_a[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + cfg_a[0+:N] <= t; + cfg_d <= THRESHS[c+pe][t]; + @(posedge clk); + end + end + end + cfg_d <= 'x; + + fork + // Intermittent configuration readback + while(!done) begin + cfg_en <= 0; + cfg_we <= 'x; + cfg_a <= 'x; + @(posedge clk); + if(($urandom()%41) == 0) begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + cfg_en <= 1; + cfg_we <= 0; + cfg_a <= addr; + @(posedge clk); + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat((DEEP_PIPELINE+1)*N+6) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("[%0d] Missing %0d outputs.", i, QW.size()); + $stop; + end + assert(QC.size() == 0) else begin + $error("[%0d] Missing %0d readback replies.", i, QC.size()); + $stop; + end + + $display("[%0d] Test completed: %0d errors in %0d tests.", i, error_cnt, MST_STRM_WROUNDS); + $display("============================================="); + term[i] <= 1; + end + + //- Readback Checker -------------- + always_ff @(posedge clk iff cfg_rack) begin + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 0 : addr[N+:$clog2(PE)]); + automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; + assert(cfg_q == exp) else begin + $error("[%0d] Readback mismatch on #%0d.%0d: %0d instead of %0d", i, cnl, addr[0+:N], cfg_q, exp); + $stop; + end + end + else begin + $error("[%0d] Spurious readback output.", i); + $stop; + end + end + + // Output Checker + int unsigned OCnl = 0; + always @(posedge clk) begin + if(rst) begin + OCnl <= 0; + ordy <= 1'b0; + end + else begin + if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; + + if(ordy && ovld) begin + assert(QW.size()) begin + automatic input_t x = QW.pop_front(); + + for(int unsigned pe = 0; pe < PE; pe++) begin + automatic int unsigned cnl = OCnl + pe; + + $display("[%0d] Mapped CNL=%0d DAT=%3x -> #%2d", i, cnl, x[pe], odat[pe]); + assert( + ((odat[pe] == 0) || (sigord(THRESHS[cnl][odat[pe]-1]) <= sigord(x[pe]))) && + ((odat[pe] == 2**N-1) || (sigord(x[pe]) < sigord(THRESHS[cnl][odat[pe]]))) + ) else begin + $error("[%0d] Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", i, cnl, x[pe], odat[pe]); + error_cnt++; + $stop; + end + end + end + else begin + $error("[%0d] Spurious output.", i); + $stop; + end + + OCnl <= (OCnl + PE)%C; + end + end + end + + end : genTypes + +endmodule: thresholding_tb diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl new file mode 100644 index 0000000000..338304fa40 --- /dev/null +++ b/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl @@ -0,0 +1,187 @@ + +# Loading additional proc with user specified bodies to compute parameter values. +source [file join [file dirname [file dirname [info script]]] gui/thresholding_axi_v1_0.gtcl] + +# Definitional proc to organize widgets for parameters. +proc init_gui { IPINST } { + ipgui::add_param $IPINST -name "Component_Name" + #Adding Page + set Page_0 [ipgui::add_page $IPINST -name "Page 0"] + ipgui::add_param $IPINST -name "ADDR_BITS" -parent ${Page_0} + ipgui::add_param $IPINST -name "BIAS" -parent ${Page_0} + ipgui::add_param $IPINST -name "C" -parent ${Page_0} + ipgui::add_param $IPINST -name "CF" -parent ${Page_0} + ipgui::add_param $IPINST -name "FPARG" -parent ${Page_0} + ipgui::add_param $IPINST -name "K" -parent ${Page_0} + ipgui::add_param $IPINST -name "N" -parent ${Page_0} + ipgui::add_param $IPINST -name "O_BITS" -parent ${Page_0} + set PE [ipgui::add_param $IPINST -name "PE" -parent ${Page_0}] + set_property tooltip {PE Count} ${PE} + ipgui::add_param $IPINST -name "SIGNED" -parent ${Page_0} + + +} + +proc update_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS PARAM_VALUE.C PARAM_VALUE.PE PARAM_VALUE.N } { + # Procedure called to update ADDR_BITS when any of the dependent parameters in the arguments change + + set ADDR_BITS ${PARAM_VALUE.ADDR_BITS} + set C ${PARAM_VALUE.C} + set PE ${PARAM_VALUE.PE} + set N ${PARAM_VALUE.N} + set values(C) [get_property value $C] + set values(PE) [get_property value $PE] + set values(N) [get_property value $N] + set_property value [gen_USERPARAMETER_ADDR_BITS_VALUE $values(C) $values(PE) $values(N)] $ADDR_BITS +} + +proc validate_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS } { + # Procedure called to validate ADDR_BITS + return true +} + +proc update_PARAM_VALUE.CF { PARAM_VALUE.CF PARAM_VALUE.C PARAM_VALUE.PE } { + # Procedure called to update CF when any of the dependent parameters in the arguments change + + set CF ${PARAM_VALUE.CF} + set C ${PARAM_VALUE.C} + set PE ${PARAM_VALUE.PE} + set values(C) [get_property value $C] + set values(PE) [get_property value $PE] + set_property value [gen_USERPARAMETER_CF_VALUE $values(C) $values(PE)] $CF +} + +proc validate_PARAM_VALUE.CF { PARAM_VALUE.CF } { + # Procedure called to validate CF + return true +} + +proc update_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS PARAM_VALUE.BIAS PARAM_VALUE.N } { + # Procedure called to update O_BITS when any of the dependent parameters in the arguments change + + set O_BITS ${PARAM_VALUE.O_BITS} + set BIAS ${PARAM_VALUE.BIAS} + set N ${PARAM_VALUE.N} + set values(BIAS) [get_property value $BIAS] + set values(N) [get_property value $N] + set_property value [gen_USERPARAMETER_O_BITS_VALUE $values(BIAS) $values(N)] $O_BITS +} + +proc validate_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS } { + # Procedure called to validate O_BITS + return true +} + +proc update_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { + # Procedure called to update BIAS when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { + # Procedure called to validate BIAS + return true +} + +proc update_PARAM_VALUE.C { PARAM_VALUE.C } { + # Procedure called to update C when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.C { PARAM_VALUE.C } { + # Procedure called to validate C + return true +} + +proc update_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { + # Procedure called to update FPARG when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { + # Procedure called to validate FPARG + return true +} + +proc update_PARAM_VALUE.K { PARAM_VALUE.K } { + # Procedure called to update K when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.K { PARAM_VALUE.K } { + # Procedure called to validate K + return true +} + +proc update_PARAM_VALUE.N { PARAM_VALUE.N } { + # Procedure called to update N when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.N { PARAM_VALUE.N } { + # Procedure called to validate N + return true +} + +proc update_PARAM_VALUE.PE { PARAM_VALUE.PE } { + # Procedure called to update PE when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.PE { PARAM_VALUE.PE } { + # Procedure called to validate PE + return true +} + +proc update_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { + # Procedure called to update SIGNED when any of the dependent parameters in the arguments change +} + +proc validate_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { + # Procedure called to validate SIGNED + return true +} + + +proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N} +} + +proc update_MODELPARAM_VALUE.K { MODELPARAM_VALUE.K PARAM_VALUE.K } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.K}] ${MODELPARAM_VALUE.K} +} + +proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C} +} + +proc update_MODELPARAM_VALUE.PE { MODELPARAM_VALUE.PE PARAM_VALUE.PE } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.PE}] ${MODELPARAM_VALUE.PE} +} + +proc update_MODELPARAM_VALUE.SIGNED { MODELPARAM_VALUE.SIGNED PARAM_VALUE.SIGNED } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.SIGNED}] ${MODELPARAM_VALUE.SIGNED} +} + +proc update_MODELPARAM_VALUE.FPARG { MODELPARAM_VALUE.FPARG PARAM_VALUE.FPARG } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.FPARG}] ${MODELPARAM_VALUE.FPARG} +} + +proc update_MODELPARAM_VALUE.BIAS { MODELPARAM_VALUE.BIAS PARAM_VALUE.BIAS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.BIAS}] ${MODELPARAM_VALUE.BIAS} +} + +proc update_MODELPARAM_VALUE.CF { MODELPARAM_VALUE.CF PARAM_VALUE.CF } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.CF}] ${MODELPARAM_VALUE.CF} +} + +proc update_MODELPARAM_VALUE.ADDR_BITS { MODELPARAM_VALUE.ADDR_BITS PARAM_VALUE.ADDR_BITS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.ADDR_BITS}] ${MODELPARAM_VALUE.ADDR_BITS} +} + +proc update_MODELPARAM_VALUE.O_BITS { MODELPARAM_VALUE.O_BITS PARAM_VALUE.O_BITS } { + # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value + set_property value [get_property value ${PARAM_VALUE.O_BITS}] ${MODELPARAM_VALUE.O_BITS} +} diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 1796738c58..0a6c0b39c9 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -30,6 +30,7 @@ import subprocess import sys import tempfile +from qonnx.util.basic import roundup_to_integer_multiple # test boards test_board_map = ["Pynq-Z1", "KV260_SOM", "ZCU104", "U250"] @@ -76,6 +77,11 @@ alveo_default_platform["U280"] = "xilinx_u280_gen3x16_xdma_1_202211_1" alveo_default_platform["U55C"] = "xilinx_u55c_gen3x16_xdma_3_202210_1" +# Create a joint part map, encompassing other boards too +part_map = {**pynq_part_map, **alveo_part_map} +part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S" +part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S" + def get_rtlsim_trace_depth(): """Return the trace depth for rtlsim via PyVerilator. Controllable @@ -228,3 +234,67 @@ def is_exe(fpath): return exe_file return None + + +def find_next_power_of_2(n): + """For any integer 'n', find the next greatest power of 2""" + # Negative values will loop infinitely below - return 0 + if n <= 0: + return 0 + # If '1' is requested, output will be '0' in the loop below, avoid this now. + elif n == 1: + return 2 # i.e. 2**1 + + # decrement 'n' (to handle cases when `n` itself is a power of 2) + n = n - 1 + + # loop until only one bit is left + while n & n - 1: + # unset rightmost bit + n = n & n - 1 + return n << 1 + + +mem_primitives_versal = { + "URAM_72x4096": (72, 4096), + "URAM_36x8192": (36, 8192), + "URAM_18x16384": (18, 16384), + "URAM_9x32768": (9, 32768), + "BRAM18_36x512": (36, 512), + "BRAM18_18x1024": (18, 1024), + "BRAM18_9x2048": (9, 2048), + "LUTRAM": (1, 64), +} + + +def get_memutil_alternatives( + req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True +): + ret = [ + (primitive_name, memutil(req_mem_spec, primitive_spec)) + for (primitive_name, primitive_spec) in mem_primitives.items() + ] + if sort_min_waste: + ret = sorted(ret, key=lambda x: x[1][2]) + return ret + + +def memutil(req_mem_spec, primitive_spec): + """Computes how many instances of a memory primitive are necessary to + implemented a desired memory size, where req_mem_spec is the desired + size and the primitive_spec is the primitve size. The sizes are expressed + as tuples of (mem_width, mem_depth). Returns (primitive_count, efficiency, waste) + where efficiency in range [0,1] indicates how much of the total capacity is + utilized, and waste indicates how many bits of storage are wasted.""" + + req_width, req_depth = req_mem_spec + prim_width, prim_depth = primitive_spec + + match_width = roundup_to_integer_multiple(req_width, prim_width) + match_depth = roundup_to_integer_multiple(req_depth, prim_depth) + count_width = match_width // prim_width + count_depth = match_depth // prim_depth + count = count_depth * count_width + eff = (req_width * req_depth) / (count * prim_width * prim_depth) + waste = (count * prim_width * prim_depth) - (req_width * req_depth) + return (count, eff, waste) diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py new file mode 100755 index 0000000000..97a8c50261 --- /dev/null +++ b/tests/util/test_basic.py @@ -0,0 +1,60 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import finn.util.basic as basic + + +@pytest.mark.util +def test_next_power_of_2(): + test_vector = [ + {"input": -2, "expected_result": 0}, + {"input": -1, "expected_result": 0}, + {"input": 0, "expected_result": 0}, + {"input": 1, "expected_result": 2}, + {"input": 2, "expected_result": 2}, + {"input": 3, "expected_result": 4}, + {"input": 4, "expected_result": 4}, + {"input": 7, "expected_result": 8}, + {"input": 8, "expected_result": 8}, + {"input": 11, "expected_result": 16}, + {"input": 15, "expected_result": 16}, + {"input": 16, "expected_result": 16}, + {"input": 18, "expected_result": 32}, + {"input": 27, "expected_result": 32}, + {"input": 31, "expected_result": 32}, + {"input": 32, "expected_result": 32}, + {"input": 42, "expected_result": 64}, + {"input": 65, "expected_result": 128}, + ] + + for test_dict in test_vector: + output = basic.find_next_power_of_2(test_dict["input"]) + assert output >= test_dict["input"] + assert output == test_dict["expected_result"] From 36603f69609e969fede24cba87d7d35f7bf78aaa Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jan 2024 21:05:04 +0000 Subject: [PATCH 078/291] [tests] add rtl impl style to threshold test Signed-off-by: aziz bahri --- .../test_fpgadataflow_thresholding.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 43eca7b7c3..e88511f5cf 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -57,7 +57,7 @@ target_clk_ns = 5 -def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs): +def make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs): NumChannels = T.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) @@ -80,6 +80,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_i ActVal=actval, mem_mode=mem_mode, numInputVectors=n_inp_vecs, + preferred_impl_style=impl_style ) graph = helper.make_graph( nodes=[Thresholding_node], @@ -111,10 +112,11 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_i @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # memory mode @pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): +def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_mode): if nf == -1: nf = ich pe = ich // nf @@ -135,7 +137,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = make_single_thresholding_modelwrapper(impl_style,T, pe, idt, odt, actval, mem_mode, n_inp_vecs) # calculate reference output # multithreshold util fxn wants NCHW input, not NHWC @@ -196,10 +198,10 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 - +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_single_layer(): +def test_runtime_thresholds_single_layer(impl_style): n_inp_vecs = [1, 2, 2] mem_mode = "decoupled" act = DataType["INT4"] @@ -223,8 +225,10 @@ def test_runtime_thresholds_single_layer(): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs) model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + op_inst = getCustomOp(model.graph.node[0]) op_inst.set_nodeattr("runtime_writeable_weights", 1) op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") From 8843c0e204c31ac82ee753bfee66526cf0277d94 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 30 Jan 2024 21:51:17 +0000 Subject: [PATCH 079/291] [CustomOp] Add Thresholding RTL Class --- .../custom_op/fpgadataflow/rtl/__init__.py | 2 + .../fpgadataflow/rtl/thresholding_rtl.py | 776 ++++++++++++++++++ .../fpgadataflow/specialize_layers.py | 1 - 3 files changed, 778 insertions(+), 1 deletion(-) create mode 100644 src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 914c033584..ae1f4e6acf 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -34,6 +34,7 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl custom_op = dict() @@ -43,3 +44,4 @@ custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl +custom_op["Thresholding_rtl"] = Thresholding_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py new file mode 100644 index 0000000000..63abdd1545 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -0,0 +1,776 @@ +# Copyright (C) 2022, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import shutil +import warnings +from pyverilator.util.axi_utils import rtlsim_multi_io +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.util.basic import ( + find_next_power_of_2, + get_memutil_alternatives, + get_rtlsim_trace_depth, + make_build_dir, + mem_primitives_versal, + pyverilate_get_liveness_threshold_cycles, +) +from finn.util.data_packing import ( + npy_to_rtlsim_input, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +"""@package Thresholding_rtl +- ONNX i/o tensor shape assumptions for Thresholding: +- input 0 is the input tensor, shape (..., NumChannels) +- input 1 is the threshold tensor, shape (NumChannels, n_thres) +- output 0 is the output tensor, shape (..., NumChannels) - same as input +- the '...' here can be any shape (representing groups of vectors) + +This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' +for a HLS equivalent. +""" + + +class Thresholding_rtl(HLSCustomOp): + """Class that corresponds to finn-rtllib 'thresholding' function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function. Used only in decoupled mode + "numSteps": ("i", True, 1), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # name of the top module in verilog template. Used by PyVerilator + # and IPI generation + "gen_top_module": ("s", False, ""), + # bias to be applied to outputs of the node + "activation_bias": ("i", False, 0), + # whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # memory depth triggers for threshold storage + "depth_trigger_uram": ("i", False, 0), + "depth_trigger_bram": ("i", False, 0), + # enable uniform thres optimization + # doesn't actually do anything yet, only + # for resource estimations + "uniform_thres": ("i", False, 0, {0, 1}), + # enable deep pipelining for easier timing closure + # setting to 0 may save some FFs but otherwise leave on + "deep_pipeline": ("i", False, 1, {0, 1}), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_pe_mem_geometries(self): + pe = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + wdt_bits = wdt.bitwidth() + odt = self.get_output_datatype() + odt_bits = odt.bitwidth() + t_channels = self.get_nodeattr("NumChannels") + cf = t_channels / pe + is_uniform = self.get_nodeattr("uniform_thres") + if is_uniform: + ret = [(odt_bits - x, cf * (2**x)) for x in range(1, odt_bits)] + else: + ret = [(wdt_bits, (cf) * 2**x) for x in range(odt_bits)] + return ret + + def get_memory_estimate(self): + res_dict = {} + depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") + depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") + pe = self.get_nodeattr("PE") + ret = self.get_pe_mem_geometries() + for mem_cfg in ret: + (width, depth) = mem_cfg + primitives = mem_primitives_versal + if depth_trigger_bram != 0 or depth_trigger_uram != 0: + if depth >= depth_trigger_bram and depth < depth_trigger_uram: + primitives = {k: v for (k, v) in mem_primitives_versal.items() if "BRAM" in k} + elif depth >= depth_trigger_uram: + primitives = {k: v for (k, v) in mem_primitives_versal.items() if "URAM" in k} + alts = get_memutil_alternatives(mem_cfg, primitives) + primary_alt = alts[0] + res_type = primary_alt[0].split("_")[0] + res_count, eff, waste = primary_alt[1] + res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count + return res_dict + + def calc_tmem(self): + """Calculates and returns TMEM.""" + num_channels = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return num_channels // pe + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + """Used for FINN DataType inference: set the output tensors' datatypes + accordingly for this node""" + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + """Required by the FINN nalysis module. Checks if custom ops in graph + are correctly built, with all attributes and inputs.""" + return [] + + def bram_estimation(self): + res_dict = self.get_memory_estimate() + return res_dict.get("BRAM", 0) + + def uram_estimation(self): + res_dict = self.get_memory_estimate() + return res_dict.get("URAM", 0) + + def lut_estimation(self): + res_dict = self.get_memory_estimate() + return res_dict.get("LUTRAM", 0) + + def get_input_datatype(self, ind=0): + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + return DataType[self.get_nodeattr("outputDataType")] + + def get_weight_datatype(self): + """The term 'weights' and 'thresholds' are used interchangably in this class.""" + return DataType[self.get_nodeattr("weightDataType")] + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("weightDataType", tdt.name) + return DataType[self.get_nodeattr("weightDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_weightstream_width(self): + """Returns weight stream width""" + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + + def get_folded_input_shape(self, ind=0): + fold = self.calc_tmem() + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + num_channels = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [num_channels]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." + assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def get_all_meminit_filenames(self, abspath=False): + "Return a list of all .dat memory initializer files used for this node" + dat_files = [] + t_path = self.get_nodeattr("code_gen_dir_ipgen") if abspath else "." + pe = self.get_nodeattr("PE") + output_data_type = self.get_nodeattr("outputDataType") # output precision + o_bitwidth = DataType[output_data_type].bitwidth() + for stage in range(o_bitwidth): + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + dat_files.append(thresh_file) + return dat_files + + def prepare_codegen_rtl_values(self, model): + """All dictionary values produced in this function are to replace + their key value(s) in the RTL template files""" + code_gen_dict = {} + + # TODO check for sortedness and size here? + # RTL component currently always expects 2^N-1 thresholds, but + # sometimes we have fewer due to e.g. narrow range quantization + thresholds = model.get_initializer(self.onnx_node.input[1]) + # add dummy dimension as final dimension (that's what gets packed with next call) + thresholds = np.expand_dims(thresholds, axis=-1) + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) + t_packed = pack_innermost_dim_as_hex_string( + thresholds, + wdt, + bw_hexdigit, + prefix="", + ) + + t_path = self.get_nodeattr("code_gen_dir_ipgen") + pe = self.get_nodeattr("PE") + output_data_type = self.get_nodeattr("outputDataType") # output precision + o_bitwidth = DataType[output_data_type].bitwidth() + num_channels = self.get_nodeattr("NumChannels") # number of channels + + channel_fold = int(num_channels / pe) + + for stage in range(o_bitwidth): + sn = o_bitwidth - stage - 1 + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + threshs = np.zeros([channel_fold * (2**stage)], dtype="object") + for ch in range(channel_fold): + for i in range(2**stage): + threshs[(ch << stage) + i] = t_packed[ch * pe + pe_value][ + (i << (o_bitwidth - stage)) + 2**sn - 1 + ] + with open(thresh_file, "w") as f: + for val in threshs: + f.write(val + "\n") + code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] + + # Identify the module name + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + "_axi_wrapper" + ] + # Set the top module name - AXI wrapper + code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] + + # Identify the module variables + input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision + bias = self.get_nodeattr("activation_bias") # activation bias value + i_bitwidth = DataType[input_data_type].bitwidth() + + code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string + code_gen_dict["$M$"] = [ + str(i_bitwidth) + ] # input/threshold precision - convert bitwidth to string + code_gen_dict["$C$"] = [str(num_channels)] # number of channels + code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value + code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE + + # Is the input datatype signed or unsigned? + # The thresholding core needs to know this when comparing weights to inputs + if self.get_input_datatype().signed(): + code_gen_dict["$SIGNED$"] = [str(1)] + else: + code_gen_dict["$SIGNED$"] = [str(0)] + + if bias >= 0: + o_bits = math.ceil(math.log2(2**o_bitwidth + bias)) + else: + o_bits = 1 + math.ceil( + math.log2(-bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias) + ) + + code_gen_dict["$O_BITS$"] = [str(int(o_bits))] + + rt_weights = self.get_nodeattr("runtime_writeable_weights") + code_gen_dict["$USE_AXILITE$"] = [str(rt_weights)] + + depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") + depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") + deep_pipeline = self.get_nodeattr("deep_pipeline") + code_gen_dict["$DEPTH_TRIGGER_URAM$"] = [str(depth_trigger_uram)] + code_gen_dict["$DEPTH_TRIGGER_BRAM$"] = [str(depth_trigger_bram)] + code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)] + return code_gen_dict + + def get_rtl_file_list(self): + """Thresholding binary search RTL file list""" + return [ + "axilite_if.v", + "thresholding.sv", + "thresholding_axi.sv", + "thresholding_template_wrapper.v", + ] + + def get_rtl_file_paths(self): + """Get full path of all RTL files""" + rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" + rtl_file_list = self.get_rtl_file_list() + rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] + return rtl_file_paths + + def get_rtl_template_data(self, path): + """Return RTL file contents as a template""" + with open(path, "r") as f: + template = f.read() + return template + + def fill_in_rtl_template_data(self, replace_dict, template_data): + """Use attribute values to finn in RTL template placeholders""" + template_data_cp = template_data + for key in replace_dict: + replacement_line = "\n".join(replace_dict[key]) + template_data_cp = template_data_cp.replace(key, replacement_line) + return template_data_cp + + def dump_rtl_data(self, dest_dir, filename, data): + """Dump filled-in-template RTL files for future synthesis step""" + # when generating template files, handle a special case: + # if the filename contains the word "template", replace that + # with the node name to distinguish between instances + filename = filename.replace("template", self.onnx_node.name) + with open(os.path.join(dest_dir, filename), "w") as f: + f.write(data) + return + + def generate_hdl(self, model): + """Prepare HDL files from templates for synthesis""" + # Generate a dictionary of values to put in RTL template + code_gen_dict = self.prepare_codegen_rtl_values(model) + + # Retrieve the destination directory for the final RTL files + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + for rtl_file_path in self.get_rtl_file_paths(): + # read in original RTL template file + template_data = self.get_rtl_template_data(rtl_file_path) + # apply code generation to templates + data = self.fill_in_rtl_template_data(code_gen_dict, template_data) + # dump filled-in template to destination directory for compilation + file_only_path = rtl_file_path.split("/")[-1] + self.dump_rtl_data(code_gen_dir, file_only_path, data) + + # Before we return - set the 'gen_top_module' attribute for use later + # by PyVerilator and IPI generation + self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) + return + + def code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl(model) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + # i.e. during the HLSSynthIP() transformation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + return + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] + dat_files = self.get_all_meminit_filenames(abspath=True) + single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + for dat_file in dat_files: + shutil.copy(dat_file, single_src_dir) + + # build the Verilator emulation library + sim = PyVerilator.build( + verilog_files, + build_dir=single_src_dir, + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_nodeattr("gen_top_module"), + auto_eval=False, + ) + + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def execute_node(self, context, graph): + # Perform input checks + if self.get_nodeattr("exec_mode") != "rtlsim": + raise Exception( + "Invalid exec_mode value: {}; exec_mode must be set to '{}'".format( + self.get_nodeattr("exec_mode"), "rtlsim" + ) + ) + mode = self.get_nodeattr("exec_mode") + if mode == "cppsim": + raise Exception("cppsim not possible for RTL Thresholding, please set exec_mode to rtlsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + node = self.onnx_node + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for Thresholding_rtl") + in_ind += 1 + + # Create a PyVerilator wrapper of the RTLSim .so + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + io_names = self.get_verilog_top_module_intf_names() + istream_name = io_names["s_axis"][0][0] + ostream_name = io_names["m_axis"][0][0] + io_dict = { + "inputs": {istream_name: inp}, + "outputs": {ostream_name: []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"][ostream_name] + + # Manage output data + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + return + + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + # no additional prefix/suffix in interface names since this is an RTL component + return "" + + def rtlsim_multi_io(self, sim, io_dict): + "Run rtlsim for this node, supports multiple i/o streams." + + rtlsim_so = self.get_nodeattr("rtlsim_so") + so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) + olcwd = os.getcwd() + os.chdir(so_dir) + + # signal name prefix + # TODO if the interface names on this component get standardized, + # it won't need its own rtlsim_multi_io variant anymore and can just + # use the base class one + sname = "_" + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + num_out_values = self.get_number_output_values() + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + do_reset=True, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + os.chdir(olcwd) + + def code_generation_ipi(self): + """Constructs and returns the TCL commands for node instantiation as an RTL + block.""" + rtl_file_list = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name + cmd = ["file mkdir %s" % source_target] + + for rtl_file in rtl_file_list: + cmd.append( + "add_files -copy_to %s -norecurse %s" + % (source_target, os.path.join(code_gen_dir, rtl_file)) + ) + + # Create an RTL block, not an IP core (-type ip) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ) + + return cmd + + def get_verilog_top_module_intf_names(self): + """Return a dict of names of input and output interfaces. + The keys reflect the protocols each interface implements: + 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. + Values are lists of tuples (axis, aximm) or names (axilite): + 'axis' tuples correspond to the list of node inputs in order, + each tuple is (interface_name, interface_width_bits). + axilite always assumed to be 32 bits and is not tuple (name only). + Each block must have at most one aximm and one axilite.""" + + intf_names = {} + intf_names["clk"] = ["ap_clk"] + intf_names["rst"] = ["ap_rst_n"] + intf_names["s_axis"] = [("in0_V", self.get_instream_width_padded())] + intf_names["m_axis"] = [("out_V", self.get_outstream_width_padded())] + intf_names["aximm"] = [] + intf_names["axilite"] = [] + intf_names["ap_none"] = [] + if self.get_nodeattr("runtime_writeable_weights") == 1: + intf_names["axilite"] = ["s_axilite"] + + return intf_names + + def get_dynamic_config(self, model, address_stride=1): + """Returns a configuration dictionary containing axilite write commands + in order to program the thresholds into the RTL core during runtime. + The default address stride for the weights is 1 byte.""" + + thresholds = model.get_initializer(self.onnx_node.input[1]) + num_channels, num_weights_per_channel = thresholds.shape + + weight_addr_boundary = find_next_power_of_2(num_weights_per_channel) + # Make sure that the next power of 2 (output) is greater than the input + assert weight_addr_boundary >= num_weights_per_channel + + config = {} + channel_cntr = 0 + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) + for channel in thresholds: + channel_start_addr = channel_cntr * weight_addr_boundary * address_stride + weight_cntr = 0 + addr = 0 + for weight in channel: + key_name = "{}_{}{}_{}{}".format( + "axilite", "ch", str(channel_cntr), "w", str(weight_cntr) + ) + config[key_name] = ( + channel_start_addr + addr, + int( + str( + pack_innermost_dim_as_hex_string( + [weight], + wdt, + bw_hexdigit, + ) + ), + 0, + ), + ) + + weight_cntr += 1 + addr += address_stride + + channel_cntr += 1 + + return config + + def ipgen_singlenode_code(self): + """Normally: Builds the bash script for IP generation.""" + """This is needed for the HLSSynthIP() transformation. + This is an IP, not a HLS node, so therefore provide an empty hook + to prevent any HLS synthesis.""" + pass + + def global_includes(self): + pass + + def defines(self, var): + pass + + def read_npy_data(self): + pass + + def strm_decl(self): + pass + + def docompute(self): + pass + + def dataoutstrm(self): + pass + + def save_as_npy(self): + pass + + def blackboxfunction(self): + pass + + def pragmas(self): + pass diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 7fda50c965..31da3756d3 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -39,7 +39,6 @@ restricted_layers = [] restricted_layers.append("MatrixVectorActivation") restricted_layers.append("VectorVectorActivation") -restricted_layers.append("Thresholding") def _determine_impl_style(node): From 7b272bde95a4be015a1f8b0023e96d30d3f3d17c Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 31 Jan 2024 10:48:19 +0000 Subject: [PATCH 080/291] [CustomOp] Clean up tests and move dynamic mode in swg hw abstraction layer --- .../fpgadataflow/convolutioninputgenerator.py | 3 +++ .../fpgadataflow/rtl/convolutioninputgenerator_rtl.py | 3 --- .../transformation/fpgadataflow/specialize_layers.py | 9 ++++++++- ...test_fpgadataflow_convinputgenerator_rtl_dynamic.py | 10 +++++++--- tests/fpgadataflow/test_fpgadataflow_dwc.py | 1 + 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 3be0a117a8..96f49069c7 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -76,6 +76,9 @@ def get_nodeattr_types(self): "parallel_window": ("i", False, 0, {0, 1}), # 1D (True) or 2D (False) spatial data "is1D": ("i", False, 0), + # Enable reprogrammable implementation to change FM dimensions, + # stride, or dilation during runtime (requires parallel_window = 0) + "dynamic_mode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index ba3921745f..6f4bafd73a 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -71,9 +71,6 @@ def get_nodeattr_types(self): my_attrs = { # additional parallelization parameter - not yet implemented "M": ("i", False, 1), - # Enable reprogrammable implementation to change FM dimensions, - # stride, or dilation during runtime (requires parallel_window = 0) - "dynamic_mode": ("i", False, 0, {0, 1}), } my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 7fda50c965..d06f7d524e 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -163,10 +163,17 @@ def _dwc_determine_impl_style(node): def _swg_hls_possible(node): + # there are some constraints to + # the HLS variant of the SWG + # first constraint to check is + # if user has set dynamic_mode to 1 + # this is only supported in rtl variant + swg = getCustomOp(node) + if swg.get_nodeattr("dynamic_mode"): + return False # the 2D HLS implementation for SWG # can only be used for square inputs # and no dilation - swg = getCustomOp(node) if swg.get_nodeattr("is1D"): return True else: diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index f5a06316e2..368bdbb2ad 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -49,6 +49,7 @@ import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx from finn.core.rtlsim_exec import rtlsim_exec @@ -60,6 +61,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pyverilate_get_liveness_threshold_cycles @@ -404,7 +406,7 @@ def make_single_slidingwindow_modelwrapper( ) SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator_rtl", + "ConvolutionInputGenerator", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -518,9 +520,11 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( dw=dw, ) + model = model.transform(SpecializeLayers()) # Simulate using stitched-ip-rtlsim so we can use existing infrastructure # that supports hook functions to re-program configuration before rtlsim model = model.transform(InsertFIFO(True)) # required for proper simulation + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) @@ -547,7 +551,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( configs = [("s_axilite_0_", config)] # Also update FIFO nodes and corresponding tensors - fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[0] + fifo_node = model.get_nodes_by_op_type("StreamingFIFO_rtl")[0] fifo_inst = getCustomOp(fifo_node) shape = fifo_inst.get_nodeattr("folded_shape") shape[1] = ifm_dim_h @@ -555,7 +559,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( fifo_inst.set_nodeattr("folded_shape", shape) update_tensor_dim(model, fifo_node.input[0], ifm_dim) - fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[1] + fifo_node = model.get_nodes_by_op_type("StreamingFIFO_rtl")[1] fifo_inst = getCustomOp(fifo_node) shape = fifo_inst.get_nodeattr("folded_shape") shape[1] = ofm_dim_h diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 706b3d2065..d46815ebac 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -164,6 +164,7 @@ def test_fpgadataflow_dwc_stitched_rtlsim(config): model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) model = model.transform(SpecializeLayers()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) From 2cafe59154b24b73f21445f98bccf65c0ca7f522 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 31 Jan 2024 10:54:49 +0000 Subject: [PATCH 081/291] [Tests] Fix linting for swg dynamic test --- .../test_fpgadataflow_convinputgenerator_rtl_dynamic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 368bdbb2ad..ee37ab86ef 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -49,7 +49,6 @@ import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx from finn.core.rtlsim_exec import rtlsim_exec From a0e56399447c3770001dbe0a1fdc9004b317b3de Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jan 2024 14:09:17 +0000 Subject: [PATCH 082/291] [CustomOp] rtl threshold must inherit abstraction and rtlbackend --- src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 63abdd1545..30671423d0 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -38,7 +38,8 @@ roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.util.basic import ( find_next_power_of_2, get_memutil_alternatives, @@ -68,8 +69,7 @@ This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' for a HLS equivalent. """ - - +class Thresholding_rtl(Thresholding, RTLBackend): class Thresholding_rtl(HLSCustomOp): """Class that corresponds to finn-rtllib 'thresholding' function.""" From 33ed7408abcd206522c814cc27343c26e2b23785 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 31 Jan 2024 14:13:03 +0000 Subject: [PATCH 083/291] [CustomOp] Remove duplicate inherited functions and attributes from thresholing --- .../fpgadataflow/rtl/thresholding_rtl.py | 75 +------------------ 1 file changed, 4 insertions(+), 71 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 30671423d0..1119461c39 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -69,8 +69,8 @@ This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' for a HLS equivalent. """ + class Thresholding_rtl(Thresholding, RTLBackend): -class Thresholding_rtl(HLSCustomOp): """Class that corresponds to finn-rtllib 'thresholding' function.""" def __init__(self, onnx_node, **kwargs): @@ -78,26 +78,6 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - # parallelization; channels thresholded per cycle - "PE": ("i", True, 0), - # number of channels (each may have different thresholds) - "NumChannels": ("i", True, 0), - # number of steps in thresholding function. Used only in decoupled mode - "numSteps": ("i", True, 1), - # FINN DataTypes for inputs, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - # name of the top module in verilog template. Used by PyVerilator - # and IPI generation - "gen_top_module": ("s", False, ""), - # bias to be applied to outputs of the node - "activation_bias": ("i", False, 0), # whether weights (thresholds) will be # writable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. @@ -113,7 +93,8 @@ def get_nodeattr_types(self): # setting to 0 may save some FFs but otherwise leave on "deep_pipeline": ("i", False, 1, {0, 1}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(Thresholding.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs def get_pe_mem_geometries(self): @@ -158,10 +139,6 @@ def calc_tmem(self): pe = self.get_nodeattr("PE") return num_channels // pe - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - def infer_node_datatype(self, model): """Used for FINN DataType inference: set the output tensors' datatypes accordingly for this node""" @@ -391,7 +368,7 @@ def prepare_codegen_rtl_values(self, model): # Identify the module variables input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision - bias = self.get_nodeattr("activation_bias") # activation bias value + bias = self.get_nodeattr("ActVal") # activation bias value i_bitwidth = DataType[input_data_type].bitwidth() code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string @@ -489,15 +466,10 @@ def generate_hdl(self, model): # Before we return - set the 'gen_top_module' attribute for use later # by PyVerilator and IPI generation self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) - return - - def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl(model) # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain # i.e. during the HLSSynthIP() transformation - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) return @@ -614,13 +586,6 @@ def execute_node(self, context, graph): context[node.output[0]] = output return - def hls_sname(self): - """Get the naming convention used by Vitis HLS for stream signals - Example: the TDATA for a stream called "out" would be out_V_TDATA. - """ - # no additional prefix/suffix in interface names since this is an RTL component - return "" - def rtlsim_multi_io(self, sim, io_dict): "Run rtlsim for this node, supports multiple i/o streams." @@ -741,36 +706,4 @@ def get_dynamic_config(self, model, address_stride=1): return config - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - """This is needed for the HLSSynthIP() transformation. - This is an IP, not a HLS node, so therefore provide an empty hook - to prevent any HLS synthesis.""" - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - def pragmas(self): - pass From 5f05460237666e4ba747280af0137b1dbf47c858 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Feb 2024 11:14:44 +0000 Subject: [PATCH 084/291] [Test] add helper functions Signed-off-by: aziz bahri --- .../test_fpgadataflow_thresholding.py | 56 +++++++++++++++---- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index e88511f5cf..69d9a2f427 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -56,6 +56,26 @@ test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC(FINN) to NCHW(Standard) +def layout_FINN2NCHW(data): + return np.transpose(data, (0, 3, 1, 2)) + +# Convert from NCHW(Standard) to NHWC(FINN) +def layout_NCHW2FINN(data): + return np.transpose(data, (0, 2, 3, 1)) + def make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs): NumChannels = T.shape[0] @@ -123,27 +143,43 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ n_inp_vecs = [1, 2, 2] assert ich % pe == 0 - # generate input data + # generate input data, data layout is NHWC for FINN x = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) odt = act n_steps = act.get_num_possible_values() - 1 - T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) - # provide non-decreasing thresholds - T = np.sort(T, axis=1) + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values( + idt, ich, n_steps + ) + + thresholds = sort_thresholds_increasing(thresholds) if odt == DataType["BIPOLAR"]: actval = 0 else: actval = odt.min() - model = make_single_thresholding_modelwrapper(impl_style,T, pe, idt, odt, actval, mem_mode, n_inp_vecs) - - # calculate reference output + # Build DUT + model = make_single_thresholding_modelwrapper( + impl_style, + thresholds, + pe, + idt, + odt, + actval, + mem_mode, + n_inp_vecs + ) + + # Expected Reference output # multithreshold util fxn wants NCHW input, not NHWC - y = multithreshold(np.transpose(x, (0, 3, 1, 2)), T) + x_nchw = layout_FINN2NCHW(x) + y = multithreshold(x_nchw, thresholds) + # convert back to NHWC for comparison to hw outputs - y = np.transpose(y, (0, 2, 3, 1)) + y = layout_NCHW2FINN(y) if act == DataType["BIPOLAR"]: # binary to bipolar y = 2 * y - 1 @@ -157,7 +193,7 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ # package input data as dictionary input_dict = {"inp": x} - # execute model + # execute DUT y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) From ff3d60c88525f947f091c1657933a2be33f54588 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Feb 2024 11:17:59 +0000 Subject: [PATCH 085/291] [Test] RTL test skip cppsim exec mode Signed-off-by: aziz bahri --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 69d9a2f427..d75c9ef992 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -137,6 +137,8 @@ def make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, m @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_mode): + if impl_style == "rtl" and exec_mode == "cppsim": + pytest.skip("rtl implstyle has no cppsim, skipping") if nf == -1: nf = ich pe = ich // nf From c54d32ce2fb619cfd231579dd2b8f0ddcf711983 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Feb 2024 14:04:21 +0000 Subject: [PATCH 086/291] [Pyverilator] update to new rtlsim_multi_io implementation --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 1275ccf31c..ba7cd28a00 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -30,7 +30,7 @@ QONNX_COMMIT="47e4357faf66b5b0d1bf77bf908bb47752421e5b" FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2" BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" -PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" +PYVERILATOR_COMMIT="fc2dd96ac07c5a23897af8f0b0339135e12fa0ba" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" From 21103343e05dedb2eebe20940d087feea627cfb5 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Feb 2024 14:12:21 +0000 Subject: [PATCH 087/291] [CustomOp] overload thresholding rtl code_generation_ipgen function --- src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 1119461c39..a539ab6f84 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -446,6 +446,9 @@ def dump_rtl_data(self, dest_dir, filename, data): f.write(data) return + def code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl(model) + def generate_hdl(self, model): """Prepare HDL files from templates for synthesis""" # Generate a dictionary of values to put in RTL template From be5ae0277dbab87fa9d8dde2840976b9d5908428 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 1 Feb 2024 14:15:10 +0000 Subject: [PATCH 088/291] [tests] relax rtlsim cycle count match Signed-off-by: aziz bahri --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index d75c9ef992..3daf44a055 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -203,6 +203,8 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ assert (y_produced == y_expected).all() model = model.transform(SpecializeLayers()) + # Make sure that SpecializeLayers did not default to HLS implementation unexpectedly + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -226,14 +228,13 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "Thresholding_hls_0" in hls_synt_res_est - - node = model.get_nodes_by_op_type("Thresholding_hls")[0] + assert model.graph.node[0].name in hls_synt_res_est + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @@ -265,6 +266,8 @@ def test_runtime_thresholds_single_layer(impl_style): model = make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs) model = model.transform(SpecializeLayers()) + + # Make sure that specialize layer did not default to HLS implementation assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) op_inst = getCustomOp(model.graph.node[0]) From f0dcec3b5375c11647d7c0177fb491f895c1f1d3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 11 Jan 2024 15:07:05 +0000 Subject: [PATCH 089/291] [hlsbackend]: update limit HLS axi streams (8k-1) --- src/finn/custom_op/fpgadataflow/hlsbackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 846894d85c..d8210fd684 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -472,5 +472,5 @@ def get_ap_int_max_w(self): instream = self.get_instream_width() outstream = self.get_outstream_width() ret = max([instream, outstream]) - assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret + assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret From 5176eb79b90e1da206ec8aad93af3af8272043db Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:11:05 +0000 Subject: [PATCH 090/291] [mvau hls]: refactored MVAU_hls custom_op --- .../hls/matrixvectoractivation_hls.py | 522 ++++++++++++++++++ 1 file changed, 522 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py new file mode 100644 index 0000000000..2ad9fefc07 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -0,0 +1,522 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... here can be any shape (representing groups of vectors) + + +class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend): + """Corresponds to finn-hlslib MatrixVectorActivation_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. + if var == "ipgen": + SIMD = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + condition = SIMD >= (MW / 1024) + msg = ( + f"HLS synthesis of MatrixVectorActivation requires: " + f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " + f"and MW={MW} for node: {self.onnx_node.name}." + ) + assert condition, msg + mem_mode = self.get_nodeattr("mem_mode") + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = np.prod(numInputVectors) + self.code_gen_dict["$DEFINES$"] = [ + """#define MW1 {}\n #define MH1 {}\n + #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n + #define TMEM1 {}\n #define numReps {}""".format( + self.get_nodeattr("MW"), + self.get_nodeattr("MH"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + self.calc_tmem(), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Stream_Batch + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + self.reset_rtlsim(sim) + self.toggle_clk(sim) + if mem_mode in ["external", "decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) \ No newline at end of file From b7480bb7a98681343a55af93627b97285e5e1e11 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:45:12 +0000 Subject: [PATCH 091/291] [refactor]: call to base_op_type method instead of custom_op type --- src/finn/analysis/fpgadataflow/res_estimation.py | 2 +- src/finn/transformation/fpgadataflow/create_stitched_ip.py | 3 ++- src/finn/transformation/fpgadataflow/floorplan.py | 2 +- src/finn/transformation/fpgadataflow/insert_dwc.py | 2 +- src/finn/transformation/fpgadataflow/insert_iodma.py | 2 +- src/finn/transformation/fpgadataflow/insert_tlastmarker.py | 4 ++-- src/finn/transformation/fpgadataflow/make_pynq_driver.py | 2 +- src/finn/transformation/fpgadataflow/make_zynq_proj.py | 2 +- src/finn/transformation/fpgadataflow/set_fifo_depths.py | 6 +++--- src/finn/transformation/fpgadataflow/set_folding.py | 2 +- 10 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index be4cf417bc..a7f220daa9 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -60,8 +60,8 @@ def res_estimation_complete(model): res_dict = {} for node in model.graph.node: if is_fpgadataflow_node(node) is True: - op_type = node.op_type inst = registry.getCustomOp(node) + op_type = inst.base_op_type() if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation": orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 1a182c7f4f..81c5848d57 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -48,12 +48,13 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) + op_type = node_inst.base_op_type() producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if node.op_type == "MatrixVectorActivation": + if op_type == "MatrixVectorActivation": if node_inst.get_nodeattr("mem_mode") == "external": return True return False diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index ceb2bdb5c9..56e644f2b8 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -150,7 +150,7 @@ def apply(self, model): continue elif not ( - node.op_type == "MatrixVectorActivation" + node_inst.base_op_type() == "MatrixVectorActivation" and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 81cee8dae4..d0029cb630 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -88,7 +88,7 @@ def apply(self, model): # - if FC and external mem, it could be connected to input 1 # - if concat, could be connected to any input if ( - consumer.op_type == "MatrixVectorActivation" + n1.base_op_type() == "MatrixVectorActivation" and n1.get_nodeattr("mem_mode") == "external" ) or (consumer.op_type == "StreamingConcat"): # get input idx diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 93e3226b2a..fd546459fa 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -199,7 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type in ["MatrixVectorActivation", "VectorVectorActivation"] + lambda x: getCustomOp(x).base_op_type() in ["MatrixVectorActivation", "VectorVectorActivation"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 157df46d71..ab5142e4d8 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -103,7 +103,7 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - first_node.op_type == "MatrixVectorActivation" + getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): @@ -117,7 +117,7 @@ def apply(self, model): num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if first_node.op_type == "MatrixVectorActivation" and inp_idx == 1: + if getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index d5c2d8f2b5..e66236bf39 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -282,7 +282,7 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) rt_layer_ind = 0 for node in dataflow_model.graph.node: - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: + if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch": node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 989eb62a88..193e6e8b42 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -62,7 +62,7 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: + if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch": if node_inst.get_nodeattr("mem_mode") == "decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 11ffc965b6..84a8084832 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -174,7 +174,7 @@ def apply(self, model): continue if fifo_cons is None: continue - if fifo_cons.op_type != "MatrixVectorActivation": + if getCustomOp(fifo_cons).base_op_type() != "MatrixVectorActivation": continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -281,7 +281,7 @@ def apply(self, model): node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - if node.onnx_node.op_type in extw_optypes: + if getCustomOp(node).base_op_type() in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -422,7 +422,7 @@ def apply(self, model): # (removed setting of node FIFO size attributes to 0 here) # for every extw node we changed from external to decoupled, # change back and reset implementation - if node.op_type in extw_optypes: + if getCustomOp(node).base_op_type() in extw_optypes: if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external") diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 4045a28e16..7b65023abc 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -125,7 +125,7 @@ def apply(self, model): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "MatrixVectorActivation": + if node_inst.base_op_type() == "MatrixVectorActivation": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) From 4f707d8308e6b6e1cb90d0f068d6536017cc9d40 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:46:01 +0000 Subject: [PATCH 092/291] [hls custom-op]: add mvau_hls --- src/finn/custom_op/fpgadataflow/hls/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 188f45273c..1f1448b9fc 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -50,6 +50,7 @@ from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls +from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls custom_op = dict() @@ -75,3 +76,4 @@ custom_op["Thresholding_hls"] = Thresholding_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls +custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls \ No newline at end of file From 7c6065c12f16ac250aee2c9717a6a54d95d52cea Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:46:17 +0000 Subject: [PATCH 093/291] [hw custom-op]: refactor MVAU --- .../fpgadataflow/matrixvectoractivation.py | 820 ++++++------------ 1 file changed, 274 insertions(+), 546 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 6699340cac..7cf6c2b2cd 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -31,20 +31,32 @@ import os import textwrap import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, + qonnx_make_model ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import qonnx.core.data_layout as DataLayout +import finn.core.onnx_exec as oxe +from qonnx.transformation.infer_shapes import InferShapes +import onnx.numpy_helper as np_helper +from qonnx.transformation.general import GiveUniqueNodeNames + # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,9 +66,8 @@ # the ... here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" +class MatrixVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -122,10 +133,14 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - } + "preferred_impl_style" : ("s", False, "hls", {"hls", "rtl"}), + } my_attrs.update(super().get_nodeattr_types()) return my_attrs + def base_op_type(self): + return "MatrixVectorActivation" + def calc_wmem(self): """Calculates and returns WMEM.""" mw = self.get_nodeattr("MW") @@ -165,6 +180,61 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") + + def get_weight_datatype(self): + """Returns FINN DataType of weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + assert ( + i_bits <= 9 + ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" + in_width = i_bits * self.get_nodeattr("SIMD") + return in_width + + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wp = self.get_weight_datatype().bitwidth() + assert ( + wp <= 8 + ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" + w_width = pe * simd * wp + return w_width + else: + return 0 + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -385,6 +455,25 @@ def dsp_estimation(self): else: mult_dsp = 0 return int(mult_dsp) +# # TODO: fix DSP estimations --> depends on fpga_part +# def dsp_estimation(self): +# # multiplication +# # mvu_8sx9 (DSP58): ceil(SIMD/3) +# # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) +# # mvu_8sx8u (DSP48): ceil(PE/2) +# # mvu_lut: 0 +# P = self.get_nodeattr("PE") +# res_type = self.get_nodeattr("resType") +# Q = self.get_nodeattr("SIMD") +# wdt = self.get_weight_datatype() +# W = wdt.bitwidth() +# idt = self.get_input_datatype() +# A = idt.bitwidth() +# if res_type == "dsp": +# mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling +# else: +# mult_dsp = 0 +# return int(mult_dsp) def get_exp_cycles(self): pe = self.get_nodeattr("PE") @@ -397,6 +486,27 @@ def get_exp_cycles(self): exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) +# # TODO: fix exp_cycles estimations --> depends on fpga_part and clk +# def get_exp_cycles(self): +# # mvu_8sx9 (DSP58): +# # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): +# # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) +# # + MW/SIMD * MH/PE +# # mvu_lut: +# # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) +# # + MW/SIMD * MH/PE +# pe = self.get_nodeattr("PE") +# simd = self.get_nodeattr("SIMD") +# num_inp_vec = self.get_nodeattr("numInputVectors") +# mh = self.get_nodeattr("MH") +# mw = self.get_nodeattr("MW") +# # since mmv != 1 is not supported yet, we set mmv for now to 1 +# mmv = 1 +# exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv +# return int(exp_cycles) + def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" # when performing FIFO insertion on an FC layer with ext weights, the ind @@ -450,17 +560,6 @@ def get_weightstream_width_padded(self): weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) - def get_ap_int_max_w(self): - # base class impl (max of inp/out stream widths) - max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream - weightstream = self.get_weightstream_width() - # single PE weight entry - weight_bits = self.get_weight_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - single_pe_w = simd * weight_bits - return max([weightstream, max_of_io, single_pe_w]) - def get_folded_input_shape(self, ind=0): mw = self.get_nodeattr("MW") mh = self.get_nodeattr("MH") @@ -505,82 +604,6 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" - - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str - - return ret - - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return - """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - # start by transposing the original weight matrix, since ONNX and - # finn-hlslib use different assumptions - # ONNX uses (in_features, out_features) and matmul(x, W) - # finn-hlslib uses (out_features, in_features) and matmul(W, x) - ret = orig_weight_matrix.T - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - # interleave rows between PEs and reshape - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - # create SIMD as innermost dimension and add a dummy outer dim - ret = ret.reshape(1, pe, wmem, simd) - # reverse the SIMD dimension - ret = np.flip(ret, axis=-1) - return ret - def minimize_accumulator_width(self, model): """Minimize the accumulator bit width according to the weight values, input data types, and size of dot product""" @@ -728,6 +751,43 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 and MW % SIMD == 0 + * for bipolar {-1,+1} weights, convert to binary {0, 1} + * interleave rows between PEs + * reshape into (1, PE, WMEM, SIMD) and return + """ + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + mw, + mh, + ), """Weights matrix doesn't + have expected shape (mw, mh)""" + assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." + assert mh % pe == 0, "Requirement MH divisable by PE is violated." + # start by transposing the original weight matrix, since ONNX and + # finn-hlslib use different assumptions + # ONNX uses (in_features, out_features) and matmul(x, W) + # finn-hlslib uses (out_features, in_features) and matmul(W, x) + ret = orig_weight_matrix.T + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + # interleave rows between PEs and reshape + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + # create SIMD as innermost dimension and add a dummy outer dim + ret = ret.reshape(1, pe, wmem, simd) + # reverse the SIMD dimension + ret = np.flip(ret, axis=-1) + return ret + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig @@ -905,402 +965,68 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. - if var == "ipgen": - SIMD = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - condition = SIMD >= (MW / 1024) - msg = ( - f"HLS synthesis of MatrixVectorActivation requires: " - f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " - f"and MW={MW} for node: {self.onnx_node.name}." - ) - assert condition, msg - mem_mode = self.get_nodeattr("mem_mode") - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = np.prod(numInputVectors) - self.code_gen_dict["$DEFINES$"] = [ - """#define MW1 {}\n #define MH1 {}\n - #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n - #define TMEM1 {}\n #define numReps {}""".format( - self.get_nodeattr("MW"), - self.get_nodeattr("MH"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - self.calc_wmem(), - self.calc_tmem(), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + def get_op_and_param_counts(self): + in_features = self.get_nodeattr("MW") + out_features = self.get_nodeattr("MH") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_inp_vec = self.get_nodeattr("numInputVectors") + num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = out_features + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Batch - (in0_{}, out_{}, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Stream_Batch - (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): mem_mode = self.get_nodeattr("mem_mode") - ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) + result = xp.xnorpopcountmatmul(in_act, mvau_w) + elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): + result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - # add resource pragma for thresholds if set - if ram_style_thresholds == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") - ) - elif ram_style_thresholds == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") - ) - elif ram_style_thresholds == "auto": - # no pragma needed - pass - else: - raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + result = np.matmul(in_act, mvau_w) + # Thresholding if noActivation==0 + if self.get_nodeattr("noActivation") == 0: + mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + mvau_thr = np_helper.to_array(mvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + result = multithreshold(result, mvau_thr, out_scale, out_bias) + + context[node.output[0]] = result def code_generation_ipi(self): cmd = [] @@ -1324,22 +1050,51 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" + % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + is_rtl_op = self.__class__.__name__ == "MatrixVectorActivation_rtl" + if is_rtl_op: + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + else: + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " @@ -1393,7 +1148,8 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" + % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1404,60 +1160,32 @@ def code_generation_ipi(self): cmd.append("assign_bd_address") cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + if is_rtl_op and mem_mode == "external": + # instantiate the RTL block + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join( + code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" + ), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + ) + ) + else: + # base class impl sufficient for const/external modes + return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") - return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return cmd \ No newline at end of file From 0cb2d594c1a67abff4167c3dfa8f1c34b1f612f6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:46:51 +0000 Subject: [PATCH 094/291] [VVAU hw custom-op]: add base_op_type method --- src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index bd5bb75f1d..891730ece3 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -104,6 +104,9 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs + def base_op_type(self): + return "VectorVectorActivation" + def minimize_accumulator_width(self, model): """Minimize the accumulator bit width according to the weight values, input data types, and size of dot product""" From 627639ab09b708861db05a97bda2d544ed314d65 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 11:47:27 +0000 Subject: [PATCH 095/291] [transform]: add transformation to infer MVAU hw custom-op --- .../fpgadataflow/convert_to_hw_layers.py | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index d1d61f0ed5..eb6dd337f5 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1280,3 +1280,139 @@ def apply(self, model): graph_modified = True return (model, graph_modified) + +class InferQuantizedMatrixVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + MatrixVectorActivation layers.""" + + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisible by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." + ) + actval = int(actval) + odt_is_bipolar = odt == DataType["BIPOLAR"] + bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) + assert scale == 1.0 or bipolar_ok, ( + consumer.name + ": out_scale=1 or bipolar output needed for conversion." + ) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + if bipolar_ok: + # remove bias for bipolar, since + # binary->bipolar is achieved by reinterpretation + actval = 0 + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=0, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name="MatrixVectorActivation_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=0, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name="MatrixVectorActivation_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) \ No newline at end of file From cd3d431331a0a9afa41ad3cbe3f721529e8bd1f2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 26 Jan 2024 12:46:06 +0000 Subject: [PATCH 096/291] [test mvau]: modified to support new custom-ops --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 120 +++++++++++++++++-- 1 file changed, 113 insertions(+), 7 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index b80ef76a19..bd283855e3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -52,6 +52,9 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames +from qonnx.transformation.infer_shapes import InferShapes +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -135,6 +138,87 @@ def prepare_inputs(input_tensor, idt, wdt): return {"inp": input_tensor} +# activation: None or DataType +@pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]]) +# neuron folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2, 1]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [-1, 2, 1]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [16]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh): + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + # generate input data + x = gen_finn_dt_tensor(idt, (1, mw)) + if act is None: + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + else: + odt = act + (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + # generate thresholds for activation + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + mw) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + # prepare input data + input_dict = prepare_inputs(x, idt, wdt) + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # convert inputs to binary and use xnorpopcountmatmul + y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2) + else: + y = np.matmul(x, W) + if T is not None: + # y = multithreshold(y, T) + if act == DataType["BIPOLAR"]: + # binary to bipolar + # y = 2 * y - 1 + y = multithreshold(y, T, 2, -1) + else: + # signed offset + # y += act.min() + y = multithreshold(y, T, 1, act.min()) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim hw-op failed" + + # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) # activation: None or DataType @@ -154,7 +238,9 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_fclayer_hlsop_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): + if idt == DataType["BIPOLAR"] and wdt != DataType["BIPOLAR"] or idt != DataType["BIPOLAR"] and wdt == DataType["BIPOLAR"]: + pytest.skip("Bipolar activations/weights only supported in MVU if both operands are bipolar") if nf == -1: nf = mh if sf == -1: @@ -195,6 +281,8 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -220,7 +308,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), "cppsim hls-op failed" # mem_mode: const or decoupled @@ -239,10 +327,14 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): + if backend == "rtl" and act is not None: + pytest.skip("RTL MVU doesn't support embedded thresholding functionality.") if nf == -1: nf = mh if sf == -1: @@ -283,6 +375,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", backend) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -303,6 +396,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -312,7 +406,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if backend == "hls": + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) @@ -339,10 +436,12 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh + mem_mode, idt, wdt, act, nf, sf, mw, mh, backend ): if nf == -1: nf = mh @@ -404,6 +503,7 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -413,7 +513,10 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if backend == "hls": + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] inst = getCustomOp(node) @@ -440,9 +543,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("mw", [32]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [32]) +# Backend +@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): if nf == -1: nf = mh if sf == -1: @@ -469,6 +574,7 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh inst.set_nodeattr("mem_mode", mem_mode) total_fold = nf * sf exp_total_cycles = total_fold + 10 + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) From 0348a7c54b29432751e2098670c939b95522be35 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:34:15 +0000 Subject: [PATCH 097/291] [vvau hls]: add custom op to dict --- src/finn/custom_op/fpgadataflow/hls/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 1f1448b9fc..ebb5ce98da 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -51,6 +51,7 @@ from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls +from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VectorVectorActivation_hls custom_op = dict() @@ -76,4 +77,5 @@ custom_op["Thresholding_hls"] = Thresholding_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls -custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls \ No newline at end of file +custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls +custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls \ No newline at end of file From b2c10d899ceeb2dc29c50e823d343a5f8c52a53e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:35:56 +0000 Subject: [PATCH 098/291] [vvu hw-op]: refactored hw custom-op VVAU --- .../fpgadataflow/vectorvectoractivation.py | 1196 ++++++----------- 1 file changed, 423 insertions(+), 773 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 891730ece3..2168474298 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -38,17 +38,21 @@ roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, pack_innermost_dim_as_hex_string, rtlsim_output_to_npy, ) +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp +from qonnx.custom_op.general.multithreshold import multithreshold -class VectorVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + +class VectorVectorActivation(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) @@ -100,6 +104,10 @@ def get_nodeattr_types(self): # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0, {0, 1}), + # Backend implementation for layer + # hls -- Vivado HLS + # rtl -- (System)Verilog + "preferred_impl_style": ("s", False, "hls", {"hls", "rtl"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -107,124 +115,55 @@ def get_nodeattr_types(self): def base_op_type(self): return "VectorVectorActivation" - def minimize_accumulator_width(self, model): - """Minimize the accumulator bit width according to the weight values, - input data types, and size of dot product""" - weights = model.get_initializer(self.onnx_node.input[1]) - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k_h * k_w).transpose() - # since in the calculation the values of the weight matrix are used, - # for the bipolar case they need to be converted to bipolar - if self.get_nodeattr("binaryXnorMode"): - weights = 2 * weights - 1 - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None - idt = self.get_input_datatype() - - (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) - # if runtime-writeable weights, then the values of the weights can - # change and we need to use the worst-case values from the datatypes - if self.get_nodeattr("runtime_writeable_weights"): - wdt = self.get_weight_datatype() - lower_worst = wdt.min() * np.ones_like(weights) - lower_range = calculate_matvec_accumulator_range(lower_worst, idt) - upper_worst = wdt.max() * np.ones_like(weights) - upper_range = calculate_matvec_accumulator_range(upper_worst, idt) - acc_min = min(min(lower_range), min(upper_range)) - acc_max = max(max(upper_range), max(upper_range)) - - # if the thresholds can be used to determine range, then adjust the range - # according to the known values of the thresholds - if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - # set threshold datatype (and accumulator datatype implicitly) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - # clip threshold values - if max_threshold > acc_max or min_threshold < acc_min: - warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, acc_min, acc_max) - model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - acc_min = min(min_threshold, acc_min) - acc_max = max(max_threshold, acc_max) + def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels): + W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32) + for ch in range(channels): + W_sparse[ch][ch] = W_conv[ch][0] + W_conv = W_sparse.astype(np.float32) + W_matmul = W_conv.transpose(0, 2, 3, 1) + W_matmul = W_matmul.reshape(channels, channels * k_h * k_w) + W_matmul = W_matmul.T + return W_matmul - # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 - if acc_min >= 0: - acc_bit_width = np.log2(acc_max + 1) - acc_bit_width = math.ceil(acc_bit_width) - adt = DataType[f"UINT{acc_bit_width}"] - # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= - # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + (_, dim_h, dim_w, _) = in_act.shape + (k_h, k_w) = self.get_nodeattr("Kernel") + channels = self.get_nodeattr("Channels") + # Reshape input activations in right format + in_act = in_act.reshape(1, dim_h, dim_w, channels, k_h*k_w) + in_act = in_act.transpose(0, 1, 2, 4, 3) + in_act = in_act.reshape(1, dim_h, dim_w, channels*k_h*k_w) + # Reshape + vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + vvau_w = np_helper.to_array(vvau_w_init) + vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels) + + if self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR": + result = np.matmul(in_act, vvau_w_onnx) + result = (result + k_h*k_w) / 2 else: - _acc_max = max(-acc_min, 1 + acc_max) - acc_bit_width = np.log2(_acc_max) + 1 - acc_bit_width = math.ceil(acc_bit_width) - adt = DataType[f"INT{acc_bit_width}"] - - # if activation, assert that the thresholds can be expressed with adt - if thresholds is not None: - assert np.vectorize(adt.allowed)( - threshold_tensor - ).all(), "Thresholds in %s can't be expressed with type %s" % ( - self.onnx_node.name, - str(adt), - ) - - # if no activation, output and accumulator datatypes are the same - if self.get_nodeattr("noActivation"): - # if this is the last node in the graph, then ensure the datatype is - # divisibly by 8 bits - if model.find_direct_successors(self.onnx_node) is None: - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - # for no-activation nodes, output dt = acc dt - self.set_nodeattr("outputDataType", adt.name) - self.set_nodeattr("accDataType", adt.name) - - return DataType[self.get_nodeattr("accDataType")] - - def minimize_weight_bit_width(self, model): - """Minimize the bit width based on the values of the weights""" - if not self.get_nodeattr("runtime_writeable_weights"): - weights = model.get_initializer(self.onnx_node.input[1]) - w_min = weights.min() - w_max = weights.max() - if w_min < 0: - if abs(w_min) > w_max: - wdt = DataType.get_smallest_possible(w_min) - else: - wdt = DataType.get_smallest_possible(-w_max - 1) - else: - wdt = DataType.get_smallest_possible(w_max) - self.set_nodeattr("weightDataType", wdt.name) - return DataType[self.get_nodeattr("weightDataType")] - - def calc_wmem(self): - """Calculates and returns WMEM.""" - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wmem = (k_h * k_w * ch // pe) // simd - return wmem + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format - def calc_tmem(self): - """Calculates and returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 - else: - ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - return ch // pe + if self.get_nodeattr("noActivation") == 0: + vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + vvau_thr = np_helper.to_array(vvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + # NHWC to NCHW for multithreshold node + result = result.transpose((0,3,1,2)) + result = multithreshold(result, vvau_thr, out_scale, out_bias) + # NCHW to NHWC + result = result.transpose((0,2,3,1)) + + # for i in range(self.get_nodeattr("Channels")): + context[node.output[0]] = result + def verify_node(self): + pass + def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() return super().make_const_shape_op(oshape) @@ -244,9 +183,6 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) - def verify_node(self): - pass - def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" return DataType[self.get_nodeattr("inputDataType")] @@ -269,12 +205,32 @@ def get_instream_width(self, ind=0): pe = self.get_nodeattr("PE") in_width = i_bits * simd * pe return in_width + + def get_weightstream_width(self): + """Returns weight stream width. Used only in decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") dim_h, dim_w = self.get_nodeattr("Dim") @@ -323,88 +279,302 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") + def calc_wmem(self): + """Calculates and returns WMEM.""" ch = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") - # currently FINN supports for vvau a batch size of 1 - batch_size = 1 - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv - return int(exp_cycles) + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd + return wmem - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle != "ultra") + or (mmode == "const") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier - return ret + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - ch, - 1, - k_h, - k_w, - ), """Weights matrix doesn't - have expected shape (channels, 1, kernel_size, kernel_size)""" - ret = orig_weight_matrix - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - ret = ret.reshape(ch, k_h * k_w) - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, simd) - return ret + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for bipolar weights&inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return + def bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = int(np.prod(self.get_nodeattr("Kernel"))) + D_out = self.get_nodeattr("Channels") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_exp_cycles(self): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + k_h, k_w = self.get_nodeattr("Kernel") + # currently FINN supports for vvau a batch size of 1 + batch_size = 1 + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = ((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv + return int(exp_cycles) + + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = self.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = weights.reshape(fm, k_h * k_w).transpose() + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + else: + thresholds = None + idt = self.get_input_datatype() + + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + + # if the thresholds can be used to determine range, then adjust the range + # according to the known values of the thresholds + if thresholds is not None: + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + # set threshold datatype (and accumulator datatype implicitly) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + # clip threshold values + if max_threshold > acc_max or min_threshold < acc_min: + warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) + thresholds = np.clip(thresholds, acc_min, acc_max) + model.set_initializer(self.onnx_node.input[2], thresholds) + threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + acc_min = min(min_threshold, acc_min) + acc_max = max(max_threshold, acc_max) + + # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] + + # if activation, assert that the thresholds can be expressed with adt + if thresholds is not None: + assert np.vectorize(adt.allowed)( + threshold_tensor + ).all(), "Thresholds in %s can't be expressed with type %s" % ( + self.onnx_node.name, + str(adt), + ) + + # if no activation, output and accumulator datatypes are the same + if self.get_nodeattr("noActivation"): + # if this is the last node in the graph, then ensure the datatype is + # divisibly by 8 bits + if model.find_direct_successors(self.onnx_node) is None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] + # for no-activation nodes, output dt = acc dt + self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) + + return DataType[self.get_nodeattr("accDataType")] + + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] + + def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for bipolar weights&inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return """ ch = self.get_nodeattr("Channels") pe = self.get_nodeattr("PE") @@ -449,6 +619,29 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + ch, + 1, + k_h, + k_w, + ), """Weights matrix doesn't + have expected shape (channels, 1, kernel_size, kernel_size)""" + ret = orig_weight_matrix + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + ret = ret.reshape(ch, k_h * k_w) + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + ret = ret.reshape(1, pe, wmem, simd) + return ret + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig @@ -626,384 +819,44 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for VectorVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - dim_h, dim_w = self.get_nodeattr("Dim") - num_w_reps = dim_h * dim_w - - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - if self.calc_tmem() != 0: - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - dim_h, dim_w = self.get_nodeattr("Dim") - numReps = 1 * dim_h * dim_w + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") - innerProdDim = k_h * k_w - mem_mode = self.get_nodeattr("mem_mode") - - self.code_gen_dict["$DEFINES$"] = [ - """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( - self.get_nodeattr("Channels"), - innerProdDim, - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), - ) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0_{} ("in0_{}");'.format( - self.get_instream_width(), self.hls_sname(), self.hls_sname() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out_{} ("out_{}");'.format( - self.get_outstream_width(), self.hls_sname(), self.hls_sname() - ) - ) - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights_{} ("weights_{}");'.format( - self.get_weightstream_width(), self.hls_sname(), self.hls_sname() - ) - ) + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_repetitions = int(dim_h * dim_w) + mac_count = k_h * k_w * fm * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = k_h * k_w * fm + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = fm + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch - (in0_{}, out_{}, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( - "Vector_Vector_Activate_Stream_Batch", - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - self.hls_sname(), - self.hls_sname(), - self.hls_sname(), - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0_{}, - hls::stream> &weights_{}, - hls::stream> &out_{} - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.hls_sname(), - self.get_weightstream_width(), - self.hls_sname(), - self.get_outstream_width(), - self.hls_sname(), - ) - ] - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") - ) - self.code_gen_dict["$PRAGMAS$"].append( - ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") - ) - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names + if mem_mode in ["decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) def code_generation_ipi(self): cmd = [] @@ -1111,207 +964,4 @@ def code_generation_ipi(self): return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") - return cmd - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const") - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM""" - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # since this is HLS memory, not using the full width of a BRAM - # assuming memories up to 128 deep get implemented in LUTs - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mstyle == "auto" and self.calc_wmem() <= 128) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) - - def bram_efficiency_estimation(self): - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * P * omega - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - acc_bits = acc_datatype.bitwidth() - k_h, k_w = self.get_nodeattr("Kernel") - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - # TODO - add 'ram_style_threshold' node attribute - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - w_width = simd * pe * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_op_and_param_counts(self): - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim_h * dim_w) - mac_count = k_h * k_w * fm * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = k_h * k_w * fm - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = fm - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + return cmd \ No newline at end of file From f7d0ad9355f3014af5d92a0750d8bec4b8b5c8fb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:36:31 +0000 Subject: [PATCH 099/291] [vvau hls-op]: refactored HLS custom-op VVAU --- .../hls/vectorvectoractivation_hls.py | 372 ++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py new file mode 100644 index 0000000000..51de49f1c7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -0,0 +1,372 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import textwrap +import warnings +from qonnx.core.datatype import DataType +from qonnx.util.basic import ( + calculate_matvec_accumulator_range, + interleave_matrix_outer_dim_from_partitions, + roundup_to_integer_multiple, +) + +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) +from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend + +class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend): + """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(VectorVectorActivation.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["const", "decoupled", "external"]: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define InnerProdDim {}\n + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + innerProdDim, + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + numReps, + ) + ] + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + + if mem_mode == "const": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "const": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", or external, + currently no other parameter value is supported!""" + ) + + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names \ No newline at end of file From f9b8fbcdf614bf060a2ce7a6faf45502b9cef9ba Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 14:37:12 +0000 Subject: [PATCH 100/291] [convert-to-hw]: added transformations to infer binary-MVAU and VVAU --- .../fpgadataflow/convert_to_hw_layers.py | 279 ++++++++++++++++++ 1 file changed, 279 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index eb6dd337f5..26cd0b74ad 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1281,6 +1281,139 @@ def apply(self, model): return (model, graph_modified) +class InferBinaryMatrixVectorActivation(Transformation): + """Convert XnorPopcountMatMul layers to + MatrixVectorActivation layers. Any immediately following MultiThreshold + layers will also be absorbed into the MVTU.""" + + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "XnorPopcountMatMul": + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( + n.name + + """: First + input for xnorpopcount is not Wset to FINN DataType BINARY.""" + ) + assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( + n.name + + """: Second + input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" + ) + idt = DataType["BINARY"] + wdt = DataType["BINARY"] + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisiable by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + if odt.bitwidth() == 1: + # covers both bipolar and binary + actval = 0 + else: + actval = odt.min() + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=1, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MatrixVectorActivation", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=1, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + mem_mode=self.mem_mode, + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + class InferQuantizedMatrixVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to MatrixVectorActivation layers.""" @@ -1415,4 +1548,150 @@ def apply(self, model): if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) + return (model, graph_modified) + +class InferVectorVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + VectorVectorActivation layers, if the sparsity annotation + of the weight matrix indicates that the MatMul layer belongs to + a depthwise convolution. Any immediately following MultiThreshold + layers will also be absorbed into the VVAU.""" + + def __init__(self, mem_mode="const"): + super().__init__() + self.mem_mode = mem_mode + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: + sparsity = model.get_tensor_sparsity(n.input[1]) + try: + k_h, k_w = sparsity["dw"]["kernel_shape"] + except KeyError: + raise Exception( + n.name + + """: sparsity annotation doesn't indicate that MatMul + belongs to a depthwise convolution.""" + ) + + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # infer dense weight tensor from sparse weight matrix + # kernel size (k_h, k_w) which was extracted above and the value of + # the channels is used. + # the weight matrix has a shape of (k_h * k_w * Channels, Channels) + # we need to reverse the creation of the sparse weight matrix + # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) + channels = int(W.shape[1]) + # transpose to achieve a shape of (k_h * k_w * Channels, Channels) + W = W.T + # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards + # to (Channels, Channels, k_h, k_w) + W = W.reshape(channels, k_h, k_w, channels) + W = W.transpose(0, 3, 1, 2) + # now we can extract the values using a for loop over the channels + # and fill a zero numpy array in the correct shape + w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) + for ch in range(channels): + w_tensor[ch][0] = W[ch][ch] + model.set_initializer(mm_weight, w_tensor) + model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) + # create node with pe=channels as default + pe = channels + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # create VVAU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == channels, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor Channels.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + assert scale == 1.0, ( + consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." + ) + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." + ) + actval = int(actval) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new VectorVectorActivation node + new_node = helper.make_node( + "VectorVectorActivation", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + resType="lut", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + noActivation=0, + name="VectorVectorActivation_" + n.name, + mem_mode=self.mem_mode, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new VVAU node + new_node = helper.make_node( + "VectorVectorActivation", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + resType="lut", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + noActivation=1, + name="VectorVectorActivation_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return (model, graph_modified) \ No newline at end of file From 8be157c07df653392210eafe9f8fdc9e2a08215e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 1 Feb 2024 15:42:14 +0000 Subject: [PATCH 101/291] [mvau/vvau hw-op]: remove duplicate node attribute --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 1 - src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 7cf6c2b2cd..e5455e1850 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -133,7 +133,6 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - "preferred_impl_style" : ("s", False, "hls", {"hls", "rtl"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 2168474298..af659dd936 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -104,10 +104,6 @@ def get_nodeattr_types(self): # use xnor-popcount for binary weights/inputs, thus treating them # as bipolar "binaryXnorMode": ("i", False, 0, {0, 1}), - # Backend implementation for layer - # hls -- Vivado HLS - # rtl -- (System)Verilog - "preferred_impl_style": ("s", False, "hls", {"hls", "rtl"}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs From 445cfa6b4c45b89b35504e2975e34f159e73c5fd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 11:21:10 +0000 Subject: [PATCH 102/291] [hw vvau]: rename specific method to more generic name --- src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index af659dd936..e6a9e1e199 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -615,7 +615,7 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + def get_hw_compatible_weight_tensor(self, orig_weight_matrix): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") ch = self.get_nodeattr("Channels") @@ -652,7 +652,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): """ # convert weights into hlslib-compatible format - weight_tensor = self.get_hls_compatible_weight_tensor(weights) + weight_tensor = self.get_hw_compatible_weight_tensor(weights) export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, # so use it as such for weight generation From e33104e662de62430411668122a3de36538ac7e2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 12:28:21 +0000 Subject: [PATCH 103/291] [hw vvau]: minor bugfix to node execution --- .../fpgadataflow/vectorvectoractivation.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index e6a9e1e199..65431a18dd 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -127,17 +127,20 @@ def execute_node(self, context, graph): (_, dim_h, dim_w, _) = in_act.shape (k_h, k_w) = self.get_nodeattr("Kernel") channels = self.get_nodeattr("Channels") - # Reshape input activations in right format - in_act = in_act.reshape(1, dim_h, dim_w, channels, k_h*k_w) - in_act = in_act.transpose(0, 1, 2, 4, 3) + pe = self.get_nodeattr("PE") + # Reorder the input activations. Note that PE gets interleaved by the SWG, + # so we have to untangle and for simplicity of computation assume pe=1. + # Note that PE has no effect on the QONNX node + in_act = in_act.reshape(1, dim_h, dim_w, channels // pe, k_h*k_w, pe) + in_act = in_act.transpose(0, 1, 2, 4, 3, 5) in_act = in_act.reshape(1, dim_h, dim_w, channels*k_h*k_w) - # Reshape + # Reshape weights in appropriate format vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] vvau_w = np_helper.to_array(vvau_w_init) vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels) if self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR": - result = np.matmul(in_act, vvau_w_onnx) + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format result = (result + k_h*k_w) / 2 else: result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format @@ -145,7 +148,7 @@ def execute_node(self, context, graph): if self.get_nodeattr("noActivation") == 0: vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] vvau_thr = np_helper.to_array(vvau_thr_init) - odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") # NHWC to NCHW for multithreshold node @@ -154,7 +157,6 @@ def execute_node(self, context, graph): # NCHW to NHWC result = result.transpose((0,2,3,1)) - # for i in range(self.get_nodeattr("Channels")): context[node.output[0]] = result def verify_node(self): From 6884030b68519327fbf447b6914bae99c1576a6e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 12:28:59 +0000 Subject: [PATCH 104/291] [test]: extend vvau test to simulate HW custom-op as well --- tests/fpgadataflow/test_fpgadataflow_vvau.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 4208169c0b..447ba5148f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -47,6 +47,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels): @@ -233,6 +234,10 @@ def test_fpgadataflow_vvau( W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) + input_dict = prepare_inputs(x_vvau) + y_hwop = oxe.execute_onnx(model, input_dict)["outp"] + model = model.transform(SpecializeLayers()) + if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -246,8 +251,6 @@ def test_fpgadataflow_vvau( else: raise Exception("Unknown exec_mode in test_fpgadataflow_vvau") - input_dict = prepare_inputs(x_vvau) - # Calculate output if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: # Simulate XNOR-popcount matrix multiplication, see @@ -271,7 +274,8 @@ def test_fpgadataflow_vvau( y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["outp"] - assert (y_produced == y_expected).all(), "incorrect result" + assert (y_hwop == y_expected).all(), "VVAU HW-op mismatches with golden output!" + assert (y_produced == y_expected).all(), "VVAU specialized-op mismatches with golden output!" if exec_mode == "rtlsim": node = model.get_nodes_by_op_type("VectorVectorActivation")[0] From 8aaec4b400623d2d63e193a66030fda892ea7b2b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 12:33:21 +0000 Subject: [PATCH 105/291] [hw mvau]: minor bugfix to node execution and cleaned up code --- .../fpgadataflow/matrixvectoractivation.py | 437 ++++++------------ 1 file changed, 151 insertions(+), 286 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index e5455e1850..63d8e586a1 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -140,25 +140,89 @@ def get_nodeattr_types(self): def base_op_type(self): return "MatrixVectorActivation" - def calc_wmem(self): - """Calculates and returns WMEM.""" - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." - wmem = mw * mh // (pe * simd) - return wmem + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) + result = xp.xnorpopcountmatmul(in_act, mvau_w) + elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): + # Convert to binary and use xnorpopcountmatmul function + result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) + else: + # Regular matrix multiplication + result = np.matmul(in_act, mvau_w) + if self.get_nodeattr("noActivation") == 0: + mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + mvau_thr = np_helper.to_array(mvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + # NHWC to NCHW for multithreshold node + result = result.transpose((0,3,1,2)) + result = multithreshold(result, mvau_thr, out_scale, out_bias) + # NCHW to NHWC + result = result.transpose((0,2,3,1)) + + context[node.output[0]] = result - def calc_tmem(self): - """Calculates and returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") else: - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - return mh // pe + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("resType") + self.get_nodeattr("MW") + self.get_nodeattr("MH") + self.get_nodeattr("SIMD") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("weightDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required MatrixVectorActivation attributes do not exist.""") + + # verify the number of inputs depending on noActivation value + # check noActivation value to determine the number of inputs + no_act = self.get_nodeattr("noActivation") + + if no_act == 1: + if len(self.onnx_node.input) == 2: + info_messages.append("The number of inputs is correct") + else: + info_messages.append( + """MatrixVectorActivation needs in no + activation mode 2 inputs (data input and weights)""" + ) + elif no_act == 0: + if len(self.onnx_node.input) == 3: + info_messages.append("The number of inputs is correct") + else: + info_messages.append( + """MatrixVectorActivation needs 3 inputs + (data input and weights and threshold values)""" + ) + else: + info_messages.append( + """noActivation attribute contains {} should + be 0 or 1""".format( + no_act + ) + ) + return info_messages def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() @@ -194,9 +258,13 @@ def get_weight_datatype(self): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] + return DataType[self.get_nodeattr("outputDataType")] def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() @@ -234,61 +302,69 @@ def get_weightstream_width_padded(self): weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") + def get_folded_input_shape(self, ind=0): + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + sf = mw // simd + nf = mh // pe + vecs = list(self.get_nodeattr("numInputVectors")) + + if ind == 0: + # calculate shape of input 0 + folded_input_shape = tuple(vecs + [sf, simd]) + elif ind == 1 and self.get_nodeattr("mem_mode") == "external": + # calculate shape of input 1 (weights) + folded_input_shape = tuple(vecs + [sf * nf, simd * pe]) else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') + raise Exception("Undefined input shape for requested input") - # verify that all necessary attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("resType") - self.get_nodeattr("MW") - self.get_nodeattr("MH") - self.get_nodeattr("SIMD") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - self.get_nodeattr("weightDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append("""The required MatrixVectorActivation attributes do not exist.""") + return folded_input_shape - # verify the number of inputs depending on noActivation value - # check noActivation value to determine the number of inputs - no_act = self.get_nodeattr("noActivation") + def get_folded_output_shape(self, ind=0): + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + nf = mh // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_output_shape = tuple(vecs + [nf, pe]) + return folded_output_shape - if no_act == 1: - if len(self.onnx_node.input) == 2: - info_messages.append("The number of inputs is correct") - else: - info_messages.append( - """MatrixVectorActivation needs in no - activation mode 2 inputs (data input and weights)""" - ) - elif no_act == 0: - if len(self.onnx_node.input) == 3: - info_messages.append("The number of inputs is correct") - else: - info_messages.append( - """MatrixVectorActivation needs 3 inputs - (data input and weights and threshold values)""" - ) - else: - info_messages.append( - """noActivation attribute contains {} should - be 0 or 1""".format( - no_act - ) - ) + def get_normal_input_shape(self, ind=0): + mw = self.get_nodeattr("MW") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [mw]) + return normal_input_shape - return info_messages + def get_normal_output_shape(self, ind=0): + mh = self.get_nodeattr("MH") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_output_shape = tuple(vecs + [mh]) + return normal_output_shape + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def calc_wmem(self): + """Calculates and returns WMEM.""" + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + assert mh % pe == 0, "Requirement MH divisable by PE is violated." + assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." + wmem = mw * mh // (pe * simd) + return wmem + + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + return mh // pe def uram_estimation(self): P = self.get_nodeattr("PE") @@ -454,25 +530,6 @@ def dsp_estimation(self): else: mult_dsp = 0 return int(mult_dsp) -# # TODO: fix DSP estimations --> depends on fpga_part -# def dsp_estimation(self): -# # multiplication -# # mvu_8sx9 (DSP58): ceil(SIMD/3) -# # mvu_4sx4u (DSP48/DSP58): ceil(PE/4) -# # mvu_8sx8u (DSP48): ceil(PE/2) -# # mvu_lut: 0 -# P = self.get_nodeattr("PE") -# res_type = self.get_nodeattr("resType") -# Q = self.get_nodeattr("SIMD") -# wdt = self.get_weight_datatype() -# W = wdt.bitwidth() -# idt = self.get_input_datatype() -# A = idt.bitwidth() -# if res_type == "dsp": -# mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling -# else: -# mult_dsp = 0 -# return int(mult_dsp) def get_exp_cycles(self): pe = self.get_nodeattr("PE") @@ -485,124 +542,6 @@ def get_exp_cycles(self): exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv return int(exp_cycles) -# # TODO: fix exp_cycles estimations --> depends on fpga_part and clk -# def get_exp_cycles(self): -# # mvu_8sx9 (DSP58): -# # 2 (replay_buffer) + ceil(chainlen/seglen) + 2 (MREG, PREG) + 2 (output reg slice) -# # + MW/SIMD * MH/PE -# # mvu_4sx4u (DSP48/DSP58) / mvu_8sx8u (DSP48): -# # 3 (IN_REG, MREG, PREG) + 2 (replay_buffer) + 2 (output reg slice) + 1 (adder tree SIMD) + 1 (output lane) -# # + MW/SIMD * MH/PE -# # mvu_lut: -# # 2 (replay_buffer) + 1 OR 2 (no MREG OR MREG) + 2 (output reg slice) -# # + MW/SIMD * MH/PE -# pe = self.get_nodeattr("PE") -# simd = self.get_nodeattr("SIMD") -# num_inp_vec = self.get_nodeattr("numInputVectors") -# mh = self.get_nodeattr("MH") -# mw = self.get_nodeattr("MW") -# # since mmv != 1 is not supported yet, we set mmv for now to 1 -# mmv = 1 -# exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv -# return int(exp_cycles) - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - # when performing FIFO insertion on an FC layer with ext weights, the ind - # parameter can be > 0 (referring to the weights) so handle that here - if ind == 0: - return DataType[self.get_nodeattr("inputDataType")] - elif ind == 1: - return DataType[self.get_nodeattr("weightDataType")] - else: - raise Exception("Undefined input ind for this layer type") - - def get_accumulator_datatype(self): - """Returns FINN DataType of accumulator""" - return DataType[self.get_nodeattr("accDataType")] - - def get_weight_datatype(self): - """Returns FINN DataType of weights.""" - return DataType[self.get_nodeattr("weightDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - in_width = i_bits * self.get_nodeattr("SIMD") - return in_width - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - out_width = o_bits * self.get_nodeattr("PE") - return out_width - - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - wp = self.get_weight_datatype().bitwidth() - w_width = pe * simd * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_folded_input_shape(self, ind=0): - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - simd = self.get_nodeattr("SIMD") - pe = self.get_nodeattr("PE") - sf = mw // simd - nf = mh // pe - vecs = list(self.get_nodeattr("numInputVectors")) - - if ind == 0: - # calculate shape of input 0 - folded_input_shape = tuple(vecs + [sf, simd]) - elif ind == 1 and self.get_nodeattr("mem_mode") == "external": - # calculate shape of input 1 (weights) - folded_input_shape = tuple(vecs + [sf * nf, simd * pe]) - else: - raise Exception("Undefined input shape for requested input") - - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - nf = mh // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_output_shape = tuple(vecs + [nf, pe]) - return folded_output_shape - - def get_normal_input_shape(self, ind=0): - mw = self.get_nodeattr("MW") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [mw]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - mh = self.get_nodeattr("MH") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_output_shape = tuple(vecs + [mh]) - return normal_output_shape - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - def minimize_accumulator_width(self, model): """Minimize the accumulator bit width according to the weight values, input data types, and size of dot product""" @@ -1003,30 +942,6 @@ def derive_characteristic_fxns(self, period): io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def execute_node(self, context, graph): - node = self.onnx_node - in_act = context[node.input[0]] - mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] - mvau_w = np_helper.to_array(mvau_w_init) - # Matrix multiplication - if self.get_nodeattr("binaryXnorMode"): - # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) - result = xp.xnorpopcountmatmul(in_act, mvau_w) - elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): - result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) - else: - result = np.matmul(in_act, mvau_w) - # Thresholding if noActivation==0 - if self.get_nodeattr("noActivation") == 0: - mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] - mvau_thr = np_helper.to_array(mvau_thr_init) - odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] - out_scale = 2 if odt_is_bipolar else 1 - out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") - result = multithreshold(result, mvau_thr, out_scale, out_bias) - - context[node.output[0]] = result - def code_generation_ipi(self): cmd = [] # add streamer if needed @@ -1056,37 +971,11 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - is_rtl_op = self.__class__.__name__ == "MatrixVectorActivation_rtl" - if is_rtl_op: - # instantiate the RTL block - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") - sourcefiles = [ - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), - rtllib_dir + "mvu_vvu_axi.sv", - rtllib_dir + "replay_buffer.sv", - rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv", - ] - for f in sourcefiles: - cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type hier -reference %s /%s/%s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - self.onnx_node.name, - ) - ) - else: - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # instantiate the hls ip + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (self.get_nodeattr("ip_vlnv"), node_name, node_name) + ) # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" @@ -1159,32 +1048,8 @@ def code_generation_ipi(self): cmd.append("assign_bd_address") cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": - if is_rtl_op and mem_mode == "external": - # instantiate the RTL block - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") - sourcefiles = [ - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), - rtllib_dir + "mvu_vvu_axi.sv", - rtllib_dir + "replay_buffer.sv", - rtllib_dir + "mvu_4sx4u.sv", - rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", - rtllib_dir + "mvu_8sx8u_dsp48.sv", - ] - for f in sourcefiles: - cmd.append("add_files -norecurse %s" % (f)) - cmd.append( - "create_bd_cell -type module -reference %s %s" - % ( - self.get_nodeattr("gen_top_module"), - self.onnx_node.name, - ) - ) - else: - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + # base class impl sufficient for const/external modes + return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd \ No newline at end of file From 07f977eaebdfcb65ff18440d3b76ddad03a3aec2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 12:42:41 +0000 Subject: [PATCH 106/291] [test]: cleaned up mvau test --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index bd283855e3..e862900e2b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -238,9 +238,7 @@ def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_hlsop_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): - if idt == DataType["BIPOLAR"] and wdt != DataType["BIPOLAR"] or idt != DataType["BIPOLAR"] and wdt == DataType["BIPOLAR"]: - pytest.skip("Bipolar activations/weights only supported in MVU if both operands are bipolar") +def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -281,6 +279,7 @@ def test_fpgadataflow_fclayer_hlsop_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + # Note: only HLS-based MVAU layers execute CPPsim inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("cppsim")) @@ -327,14 +326,10 @@ def test_fpgadataflow_fclayer_hlsop_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, @pytest.mark.parametrize("mw", [16]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [16]) -# Backend -@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): - if backend == "rtl" and act is not None: - pytest.skip("RTL MVU doesn't support embedded thresholding functionality.") +def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -375,7 +370,6 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, ba # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) - inst.set_nodeattr("preferred_impl_style", backend) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -397,7 +391,8 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, ba # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... model = model.transform(SpecializeLayers()) - model = model.transform(SetExecMode("rtlsim")) + # model = model.transform(SetExecMode("rtlsim")) + model.set_metadata_prop("exec_mode", "rtlsim") model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) @@ -406,8 +401,11 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, ba assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - if backend == "hls": assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + else: + assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est + assert "MatrixVectorActivation_hls_0" in hls_synt_res_est else: assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est From 3466e882d14b7c2a2e0848268aac031c61b78436 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 12:43:09 +0000 Subject: [PATCH 107/291] [hw mvau]: minor bugfix --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 63d8e586a1..8f8292e994 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -158,7 +158,7 @@ def execute_node(self, context, graph): if self.get_nodeattr("noActivation") == 0: mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] mvau_thr = np_helper.to_array(mvau_thr_init) - odt_is_bipolar = self.get_nodeattr("outputDataType") == DataType["BIPOLAR"] + odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") # NHWC to NCHW for multithreshold node From 496869fc8fbd34ff343aa417a1d27f4997744985 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 14:13:39 +0000 Subject: [PATCH 108/291] updated copyright header --- .../custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py | 2 +- .../custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py | 2 +- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- src/finn/custom_op/fpgadataflow/vectorvectoractivation.py | 2 +- tests/fpgadataflow/test_fpgadataflow_mvau.py | 2 +- tests/fpgadataflow/test_fpgadataflow_vvau.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 2ad9fefc07..e27e77fe4f 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index 51de49f1c7..615ff7c71e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 8f8292e994..04594f4109 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 65431a18dd..e793321879 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index e862900e2b..38f77e3836 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 447ba5148f..1cb64dda91 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without From 2910acaf0c22d31d6de28388c400653561635ab6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 2 Feb 2024 15:08:06 +0000 Subject: [PATCH 109/291] [hls vvau]: add execute_node function --- .../hls/vectorvectoractivation_hls.py | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index 615ff7c71e..c824f9682c 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -59,6 +59,112 @@ def get_nodeattr_types(self): my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for VectorVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + if mem_mode == "external" or mem_mode == "decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" From b4fb60458bf0b5e52425623bd2ee0f64f51e4d06 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Feb 2024 15:33:52 +0000 Subject: [PATCH 110/291] [CustomOp] re arrange threshold mem_mode related attributes to match class hiearchy --- .../fpgadataflow/hls/thresholding_hls.py | 19 +----------------- .../fpgadataflow/rtl/thresholding_rtl.py | 4 ---- .../custom_op/fpgadataflow/thresholding.py | 20 +++++++++++++++++++ 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index fb90365eef..1cd5f4d3ed 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -59,24 +59,7 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - # string defining memory type - "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", "decoupled"}), - # (mem_mode = decoupled only) whether weights (thresholds) will be - # writable through an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. - # see finn-rtllib/memstream/doc/README for more about the memory - # address map used for writable weights - # IMPORTANT: After using AXI lite to either read or write the weights, - # always "flush" the accelerator by first passing a dummy input - # vector through the accelerator. This will get rid of any old - # weight data from the weight FIFOs. - "runtime_writeable_weights": ("i", False, 0, {0, 1}), - } + my_attrs = {} my_attrs.update(Thresholding.get_nodeattr_types(self)) my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index a539ab6f84..50e30efc4f 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -78,10 +78,6 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - # whether weights (thresholds) will be - # writable through an AXI-lite interface during runtime - # 1 for enabled, 0 for disabled. - "runtime_writeable_weights": ("i", False, 0, {0, 1}), # memory depth triggers for threshold storage "depth_trigger_uram": ("i", False, 0), "depth_trigger_bram": ("i", False, 0), diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 1ce059358e..8494cf97bb 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -42,6 +42,26 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { + # memory mode for the thresholds + # const -- embedded thresholds, default + # decoupled -- streaming thresholds with streamer packaged inside IP + "mem_mode": ("s", False, "const", {"const", "decoupled"}), + # whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # FPGA resource type for memories in decoupled mode + # auto -- let Vivado decide + # block -- use BRAM + # distributed -- use LUTRAM + # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 + # see also https://www.xilinx.com/support/answers/38070.html + "ram_style": ( + "s", + False, + "auto", + {"auto", "block", "distributed", "ultra"}, + ), # parallelization; channels thresholded per cycle "PE": ("i", True, 0), # number of channels (each may have different thresholds) From 9ec0a3dd0d1bf9048cc52829867c4fa382080e0a Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Feb 2024 15:35:50 +0000 Subject: [PATCH 111/291] [rtllib] Remove threshold IP Package --- finn-rtllib/thresholding/component.xml | 1002 ----------------- .../gui/thresholding_axi_v1_0.gtcl | 4 - .../xgui/thresholding_axi_v1_0.tcl | 187 --- 3 files changed, 1193 deletions(-) delete mode 100644 finn-rtllib/thresholding/component.xml delete mode 100644 finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl delete mode 100644 finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl diff --git a/finn-rtllib/thresholding/component.xml b/finn-rtllib/thresholding/component.xml deleted file mode 100644 index e28a3a2c2d..0000000000 --- a/finn-rtllib/thresholding/component.xml +++ /dev/null @@ -1,1002 +0,0 @@ - - - amd.com - finn - thresholding_axi - 1.0 - - - ap_clk - - - - - - - CLK - - - ap_clk - - - - - - ASSOCIATED_RESET - ap_rst_n - - - ASSOCIATED_BUSIF - s_axilite:s_axis:m_axis - - - FREQ_TOLERANCE_HZ - -1 - - - - - m_axis - - - - - - - TDATA - - - m_axis_tdata - - - - - TVALID - - - m_axis_tvalid - - - - - TREADY - - - m_axis_tready - - - - - - s_axis - - - - - - - TDATA - - - s_axis_tdata - - - - - TVALID - - - s_axis_tvalid - - - - - TREADY - - - s_axis_tready - - - - - - s_axilite - - - - - - - - - AWADDR - - - s_axilite_AWADDR - - - - - AWVALID - - - s_axilite_AWVALID - - - - - AWREADY - - - s_axilite_AWREADY - - - - - WDATA - - - s_axilite_WDATA - - - - - WSTRB - - - s_axilite_WSTRB - - - - - WVALID - - - s_axilite_WVALID - - - - - WREADY - - - s_axilite_WREADY - - - - - BRESP - - - s_axilite_BRESP - - - - - BVALID - - - s_axilite_BVALID - - - - - BREADY - - - s_axilite_BREADY - - - - - ARADDR - - - s_axilite_ARADDR - - - - - ARVALID - - - s_axilite_ARVALID - - - - - ARREADY - - - s_axilite_ARREADY - - - - - RDATA - - - s_axilite_RDATA - - - - - RRESP - - - s_axilite_RRESP - - - - - RVALID - - - s_axilite_RVALID - - - - - RREADY - - - s_axilite_RREADY - - - - - - ap_rst_n - - - - - - - RST - - - ap_rst_n - - - - - - POLARITY - ACTIVE_LOW - - - - - - - s_axilite - s_axilite - - reg0 - reg0 - 0x0 - 4096 - 32 - register - - - - - - - xilinx_anylanguagesynthesis - Synthesis - :vivado.xilinx.com:synthesis - Verilog - thresholding_axi_wrapper - - xilinx_anylanguagesynthesis_view_fileset - - - - viewChecksum - fd0bd85b - - - - - xilinx_anylanguagebehavioralsimulation - Simulation - :vivado.xilinx.com:simulation - Verilog - thresholding_axi_wrapper - - xilinx_anylanguagebehavioralsimulation_view_fileset - - - - viewChecksum - fd0bd85b - - - - - xilinx_xpgui - UI Layout - :vivado.xilinx.com:xgui.ui - - xilinx_xpgui_view_fileset - - - - viewChecksum - fc6b9b63 - - - - - xilinx_utilityxitfiles - Utility XIT/TTCL - :vivado.xilinx.com:xit.util - - xilinx_utilityxitfiles_view_fileset - - - - viewChecksum - 8b0215cd - - - - - - - ap_clk - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - ap_rst_n - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_AWVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_AWREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_AWADDR - - in - - 5 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_WVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_WREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_WDATA - - in - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_WSTRB - - in - - 3 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - s_axilite_BVALID - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_BREADY - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_BRESP - - out - - 1 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_ARVALID - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_ARREADY - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_ARADDR - - in - - 5 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_RVALID - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_RREADY - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - s_axilite_RDATA - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axilite_RRESP - - out - - 1 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_tready - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_tvalid - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - s_axis_tdata - - in - - 15 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - m_axis_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_tdata - - out - - 7 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - - - N - N - 4 - - - K - K - 16 - - - C - C - 1 - - - PE - Pe - 1 - - - SIGNED - Signed - true - - - FPARG - Fparg - false - - - BIAS - Bias - 0 - - - CF - Cf - 1 - - - ADDR_BITS - Addr Bits - 6 - - - O_BITS - O Bits - 4 - - - - - - choice_list_9d8b0d81 - ACTIVE_HIGH - ACTIVE_LOW - - - - - xilinx_anylanguagesynthesis_view_fileset - - hdl/thresholding.sv - systemVerilogSource - - - hdl/thresholding_axi.sv - systemVerilogSource - - - hdl/thresholding_axi_wrapper.v - verilogSource - CHECKSUM_7b8c102d - - - hdl/axilite_if.v - verilogSource - CHECKSUM_69d1ba26 - xil_defaultlib - - - - xilinx_anylanguagebehavioralsimulation_view_fileset - - hdl/thresholding.sv - systemVerilogSource - - - hdl/thresholding_axi.sv - systemVerilogSource - - - hdl/thresholding_axi_wrapper.v - verilogSource - - - hdl/axilite_if.v - verilogSource - USED_IN_ipstatic - xil_defaultlib - - - - xilinx_xpgui_view_fileset - - xgui/thresholding_axi_v1_0.tcl - tclSource - CHECKSUM_fc6b9b63 - XGUI_VERSION_2 - - - - xilinx_utilityxitfiles_view_fileset - - gui/thresholding_axi_v1_0.gtcl - GTCL - - - - MultiThreshold - - - N - Output Precision - 4 - - - K - Input Precision - 16 - - - C - Channels - 1 - - - PE - Pe - 1 - - - SIGNED - Signed Inputs - true - - - FPARG - Floating-Point Inputs - false - - - BIAS - Bias - 0 - - - CF - Channel Fold - 1 - - - - false - - - - - - ADDR_BITS - Address Bits - 6 - - - - false - - - - - - O_BITS - Output Value Width - 4 - - - - false - - - - - - Component_Name - thresholding_axi_wrapper_v1_0 - - - - - - virtex7 - qvirtex7 - versal - kintex7 - kintex7l - qkintex7 - qkintex7l - akintex7 - artix7 - artix7l - aartix7 - qartix7 - zynq - qzynq - azynq - spartan7 - aspartan7 - virtexu - zynquplus - virtexuplus - virtexuplusHBM - virtexuplus58g - kintexuplus - artixuplus - kintexu - - - /UserIP - - thresholding_axi - level_1 - package_project - 2 - - user.org:user:thresholding_axi_wrapper:1.0 - - 2023-06-27T05:47:20Z - - - - - - 2022.2 - - - - - - - - - - - - - - diff --git a/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl b/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl deleted file mode 100644 index 90d73ede7e..0000000000 --- a/finn-rtllib/thresholding/gui/thresholding_axi_v1_0.gtcl +++ /dev/null @@ -1,4 +0,0 @@ -# This file is automatically written. Do not modify. -proc gen_USERPARAMETER_CF_VALUE {C PE } {expr $C/$PE} -proc gen_USERPARAMETER_ADDR_BITS_VALUE {C PE N } {expr int(ceil(log($C/$PE)/log(2))+ceil(log($PE)/log(2))+$N+2)} -proc gen_USERPARAMETER_O_BITS_VALUE {BIAS N } {expr int(ceil($BIAS >= 0? log(pow(2,$N)+$BIAS)/log(2) : 1+log(-$BIAS >= pow(2,$N-1)? -$BIAS : pow(2,$N)+$BIAS)/log(2)))} diff --git a/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl b/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl deleted file mode 100644 index 338304fa40..0000000000 --- a/finn-rtllib/thresholding/xgui/thresholding_axi_v1_0.tcl +++ /dev/null @@ -1,187 +0,0 @@ - -# Loading additional proc with user specified bodies to compute parameter values. -source [file join [file dirname [file dirname [info script]]] gui/thresholding_axi_v1_0.gtcl] - -# Definitional proc to organize widgets for parameters. -proc init_gui { IPINST } { - ipgui::add_param $IPINST -name "Component_Name" - #Adding Page - set Page_0 [ipgui::add_page $IPINST -name "Page 0"] - ipgui::add_param $IPINST -name "ADDR_BITS" -parent ${Page_0} - ipgui::add_param $IPINST -name "BIAS" -parent ${Page_0} - ipgui::add_param $IPINST -name "C" -parent ${Page_0} - ipgui::add_param $IPINST -name "CF" -parent ${Page_0} - ipgui::add_param $IPINST -name "FPARG" -parent ${Page_0} - ipgui::add_param $IPINST -name "K" -parent ${Page_0} - ipgui::add_param $IPINST -name "N" -parent ${Page_0} - ipgui::add_param $IPINST -name "O_BITS" -parent ${Page_0} - set PE [ipgui::add_param $IPINST -name "PE" -parent ${Page_0}] - set_property tooltip {PE Count} ${PE} - ipgui::add_param $IPINST -name "SIGNED" -parent ${Page_0} - - -} - -proc update_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS PARAM_VALUE.C PARAM_VALUE.PE PARAM_VALUE.N } { - # Procedure called to update ADDR_BITS when any of the dependent parameters in the arguments change - - set ADDR_BITS ${PARAM_VALUE.ADDR_BITS} - set C ${PARAM_VALUE.C} - set PE ${PARAM_VALUE.PE} - set N ${PARAM_VALUE.N} - set values(C) [get_property value $C] - set values(PE) [get_property value $PE] - set values(N) [get_property value $N] - set_property value [gen_USERPARAMETER_ADDR_BITS_VALUE $values(C) $values(PE) $values(N)] $ADDR_BITS -} - -proc validate_PARAM_VALUE.ADDR_BITS { PARAM_VALUE.ADDR_BITS } { - # Procedure called to validate ADDR_BITS - return true -} - -proc update_PARAM_VALUE.CF { PARAM_VALUE.CF PARAM_VALUE.C PARAM_VALUE.PE } { - # Procedure called to update CF when any of the dependent parameters in the arguments change - - set CF ${PARAM_VALUE.CF} - set C ${PARAM_VALUE.C} - set PE ${PARAM_VALUE.PE} - set values(C) [get_property value $C] - set values(PE) [get_property value $PE] - set_property value [gen_USERPARAMETER_CF_VALUE $values(C) $values(PE)] $CF -} - -proc validate_PARAM_VALUE.CF { PARAM_VALUE.CF } { - # Procedure called to validate CF - return true -} - -proc update_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS PARAM_VALUE.BIAS PARAM_VALUE.N } { - # Procedure called to update O_BITS when any of the dependent parameters in the arguments change - - set O_BITS ${PARAM_VALUE.O_BITS} - set BIAS ${PARAM_VALUE.BIAS} - set N ${PARAM_VALUE.N} - set values(BIAS) [get_property value $BIAS] - set values(N) [get_property value $N] - set_property value [gen_USERPARAMETER_O_BITS_VALUE $values(BIAS) $values(N)] $O_BITS -} - -proc validate_PARAM_VALUE.O_BITS { PARAM_VALUE.O_BITS } { - # Procedure called to validate O_BITS - return true -} - -proc update_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { - # Procedure called to update BIAS when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.BIAS { PARAM_VALUE.BIAS } { - # Procedure called to validate BIAS - return true -} - -proc update_PARAM_VALUE.C { PARAM_VALUE.C } { - # Procedure called to update C when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.C { PARAM_VALUE.C } { - # Procedure called to validate C - return true -} - -proc update_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { - # Procedure called to update FPARG when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.FPARG { PARAM_VALUE.FPARG } { - # Procedure called to validate FPARG - return true -} - -proc update_PARAM_VALUE.K { PARAM_VALUE.K } { - # Procedure called to update K when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.K { PARAM_VALUE.K } { - # Procedure called to validate K - return true -} - -proc update_PARAM_VALUE.N { PARAM_VALUE.N } { - # Procedure called to update N when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.N { PARAM_VALUE.N } { - # Procedure called to validate N - return true -} - -proc update_PARAM_VALUE.PE { PARAM_VALUE.PE } { - # Procedure called to update PE when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.PE { PARAM_VALUE.PE } { - # Procedure called to validate PE - return true -} - -proc update_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { - # Procedure called to update SIGNED when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.SIGNED { PARAM_VALUE.SIGNED } { - # Procedure called to validate SIGNED - return true -} - - -proc update_MODELPARAM_VALUE.N { MODELPARAM_VALUE.N PARAM_VALUE.N } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.N}] ${MODELPARAM_VALUE.N} -} - -proc update_MODELPARAM_VALUE.K { MODELPARAM_VALUE.K PARAM_VALUE.K } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.K}] ${MODELPARAM_VALUE.K} -} - -proc update_MODELPARAM_VALUE.C { MODELPARAM_VALUE.C PARAM_VALUE.C } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.C}] ${MODELPARAM_VALUE.C} -} - -proc update_MODELPARAM_VALUE.PE { MODELPARAM_VALUE.PE PARAM_VALUE.PE } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.PE}] ${MODELPARAM_VALUE.PE} -} - -proc update_MODELPARAM_VALUE.SIGNED { MODELPARAM_VALUE.SIGNED PARAM_VALUE.SIGNED } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.SIGNED}] ${MODELPARAM_VALUE.SIGNED} -} - -proc update_MODELPARAM_VALUE.FPARG { MODELPARAM_VALUE.FPARG PARAM_VALUE.FPARG } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.FPARG}] ${MODELPARAM_VALUE.FPARG} -} - -proc update_MODELPARAM_VALUE.BIAS { MODELPARAM_VALUE.BIAS PARAM_VALUE.BIAS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.BIAS}] ${MODELPARAM_VALUE.BIAS} -} - -proc update_MODELPARAM_VALUE.CF { MODELPARAM_VALUE.CF PARAM_VALUE.CF } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.CF}] ${MODELPARAM_VALUE.CF} -} - -proc update_MODELPARAM_VALUE.ADDR_BITS { MODELPARAM_VALUE.ADDR_BITS PARAM_VALUE.ADDR_BITS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.ADDR_BITS}] ${MODELPARAM_VALUE.ADDR_BITS} -} - -proc update_MODELPARAM_VALUE.O_BITS { MODELPARAM_VALUE.O_BITS PARAM_VALUE.O_BITS } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.O_BITS}] ${MODELPARAM_VALUE.O_BITS} -} From 3fd52600a9e095c04084b300e46fdb8fb4aee553 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Feb 2024 15:56:47 +0000 Subject: [PATCH 112/291] [CustomOp] do not allow ram_style for threshold RTL --- .../custom_op/fpgadataflow/hls/thresholding_hls.py | 5 ++++- src/finn/custom_op/fpgadataflow/thresholding.py | 12 ------------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 1cd5f4d3ed..16dee92e8a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -59,7 +59,10 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = {} + my_attrs = { + # string defining memory type + "ram_style": ("s", False, "distributed", {"distributed", "block"}), + } my_attrs.update(Thresholding.get_nodeattr_types(self)) my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 8494cf97bb..945ec16cf0 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -50,18 +50,6 @@ def get_nodeattr_types(self): # writable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # FPGA resource type for memories in decoupled mode - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use UltraRAM (URAM), must have runtime_writeable_weights=1 - # see also https://www.xilinx.com/support/answers/38070.html - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), # parallelization; channels thresholded per cycle "PE": ("i", True, 0), # number of channels (each may have different thresholds) From c8281c33d47bc084375dbf4d7b21cf69c261ee57 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Feb 2024 15:58:37 +0000 Subject: [PATCH 113/291] Revert "[Pyverilator] update to new rtlsim_multi_io implementation" This reverts commit c54d32ce2fb619cfd231579dd2b8f0ddcf711983. --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index ba7cd28a00..1275ccf31c 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -30,7 +30,7 @@ QONNX_COMMIT="47e4357faf66b5b0d1bf77bf908bb47752421e5b" FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2" BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" -PYVERILATOR_COMMIT="fc2dd96ac07c5a23897af8f0b0339135e12fa0ba" +PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" From 3f3b7c5e6972a0ade4f8e3488152e2febe4d2325 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 2 Feb 2024 16:52:41 +0000 Subject: [PATCH 114/291] [CustomOp] Thresholding node must explicitly reset_rtlsim --- src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 50e30efc4f..48aeb0b9f8 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -31,7 +31,7 @@ import os import shutil import warnings -from pyverilator.util.axi_utils import rtlsim_multi_io +from pyverilator.util.axi_utils import rtlsim_multi_io, reset_rtlsim from qonnx.core.datatype import DataType from qonnx.util.basic import ( interleave_matrix_outer_dim_from_partitions, @@ -603,13 +603,13 @@ def rtlsim_multi_io(self, sim, io_dict): if trace_file == "default": trace_file = self.onnx_node.name + ".vcd" num_out_values = self.get_number_output_values() + reset_rtlsim(sim) total_cycle_count = rtlsim_multi_io( sim, io_dict, num_out_values, trace_file=trace_file, sname=sname, - do_reset=True, liveness_threshold=pyverilate_get_liveness_threshold_cycles(), ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) From 84ec9eadb5a8c2a19db26453084fe355069b58b8 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 2 Feb 2024 17:22:43 +0000 Subject: [PATCH 115/291] [CustomOp/Transform] Fix linting and cleanup --- .../analysis/fpgadataflow/res_estimation.py | 6 +- .../custom_op/fpgadataflow/hls/__init__.py | 10 ++- .../hls/matrixvectoractivation_hls.py | 33 ++------- .../hls/vectorvectoractivation_hls.py | 22 ++---- .../fpgadataflow/matrixvectoractivation.py | 70 +++++++------------ .../fpgadataflow/vectorvectoractivation.py | 41 +++++------ .../fpgadataflow/convert_to_hw_layers.py | 5 +- .../fpgadataflow/create_stitched_ip.py | 4 +- .../transformation/fpgadataflow/floorplan.py | 2 +- .../transformation/fpgadataflow/insert_dwc.py | 2 +- .../fpgadataflow/insert_iodma.py | 3 +- .../fpgadataflow/insert_tlastmarker.py | 4 +- .../fpgadataflow/make_pynq_driver.py | 4 +- .../fpgadataflow/make_zynq_proj.py | 4 +- .../fpgadataflow/set_fifo_depths.py | 8 +-- .../fpgadataflow/set_folding.py | 2 +- tests/fpgadataflow/test_fpgadataflow_mvau.py | 10 +-- 17 files changed, 90 insertions(+), 140 deletions(-) diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index a7f220daa9..d48c423b9d 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -61,8 +61,10 @@ def res_estimation_complete(model): for node in model.graph.node: if is_fpgadataflow_node(node) is True: inst = registry.getCustomOp(node) - op_type = inst.base_op_type() - if op_type == "MatrixVectorActivation" or op_type == "VectorVectorActivation": + op_type = node.op_type + if op_type.startswith("MatrixVectorActivation") or op_type.startswith( + "VectorVectorActivation" + ): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index ebb5ce98da..1e2c83ba39 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -41,6 +41,9 @@ from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls +from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import ( + MatrixVectorActivation_hls, +) from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( StreamingDataWidthConverter_hls, @@ -50,8 +53,9 @@ from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls -from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MatrixVectorActivation_hls -from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VectorVectorActivation_hls +from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import ( + VectorVectorActivation_hls, +) custom_op = dict() @@ -78,4 +82,4 @@ custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls -custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls \ No newline at end of file +custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index e27e77fe4f..5206ee3a06 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -26,26 +26,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math import numpy as np import os -import textwrap -import warnings from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - calculate_matvec_accumulator_range, - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) -from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -60,7 +47,7 @@ class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend): def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) - + def get_nodeattr_types(self): my_attrs = {} my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self)) @@ -480,17 +467,13 @@ def execute_node(self, context, graph): elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) self.reset_rtlsim(sim) self.toggle_clk(sim) if mem_mode in ["external", "decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() - wei = npy_to_rtlsim_input( - "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits - ) + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict = { "inputs": {"in0": inp, "weights": wei * num_w_reps}, @@ -505,9 +488,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) oshape = self.get_normal_output_shape() @@ -519,4 +500,4 @@ def execute_node(self, context, graph): has to be set to "rtlsim" """.format( mode ) - ) \ No newline at end of file + ) diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index c824f9682c..7e475ff67f 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -26,26 +26,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import math import numpy as np import os -import textwrap -import warnings from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - calculate_matvec_accumulator_range, - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) - -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) -from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation + from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend): """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" @@ -475,4 +463,4 @@ def get_verilog_top_module_intf_names(self): runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: intf_names["axilite"] = ["s_axilite"] - return intf_names \ No newline at end of file + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 04594f4109..463a4effa8 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -28,35 +28,20 @@ import math import numpy as np -import os +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp import textwrap import warnings -from onnx import TensorProto, helper from qonnx.core.datatype import DataType -import qonnx.custom_op.general.xnorpopcount as xp from qonnx.custom_op.general.multithreshold import multithreshold -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, - qonnx_make_model ) from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) -import qonnx.core.data_layout as DataLayout -import finn.core.onnx_exec as oxe -from qonnx.transformation.infer_shapes import InferShapes -import onnx.numpy_helper as np_helper -from qonnx.transformation.general import GiveUniqueNodeNames - +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -133,7 +118,7 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - } + } my_attrs.update(super().get_nodeattr_types()) return my_attrs @@ -147,11 +132,15 @@ def execute_node(self, context, graph): mvau_w = np_helper.to_array(mvau_w_init) # Matrix multiplication if self.get_nodeattr("binaryXnorMode"): - # Note: activation/weights are expected to be binary (by design coming from the transformation inferring this operation mode) + # Note: activation/weights are expected to be binary + # (by design coming from the transformation inferring this operation mode) result = xp.xnorpopcountmatmul(in_act, mvau_w) - elif (self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR"): + elif ( + self.get_nodeattr("inputDataType") == "BIPOLAR" + and self.get_nodeattr("weightDataType") == "BIPOLAR" + ): # Convert to binary and use xnorpopcountmatmul function - result = xp.xnorpopcountmatmul((in_act+1)/2, (mvau_w+1)/2) + result = xp.xnorpopcountmatmul((in_act + 1) / 2, (mvau_w + 1) / 2) else: # Regular matrix multiplication result = np.matmul(in_act, mvau_w) @@ -162,11 +151,11 @@ def execute_node(self, context, graph): out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") # NHWC to NCHW for multithreshold node - result = result.transpose((0,3,1,2)) + result = result.transpose((0, 3, 1, 2)) result = multithreshold(result, mvau_thr, out_scale, out_bias) # NCHW to NHWC - result = result.transpose((0,2,3,1)) - + result = result.transpose((0, 2, 3, 1)) + context[node.output[0]] = result def verify_node(self): @@ -260,20 +249,22 @@ def get_weight_datatype(self): def get_accumulator_datatype(self): """Returns FINN DataType of accumulator""" - return DataType[self.get_nodeattr("accDataType")] + return DataType[self.get_nodeattr("accDataType")] def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] + return DataType[self.get_nodeattr("outputDataType")] def get_instream_width(self, ind=0): i_bits = self.get_input_datatype().bitwidth() - assert ( - i_bits <= 9 - ), "RTL-based MVAU only supports activations with bit-width up to 9-bits" in_width = i_bits * self.get_nodeattr("SIMD") return in_width + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + out_width = o_bits * self.get_nodeattr("PE") + return out_width + def get_weightstream_width(self): """Returns weight stream width. Used only in decoupled mode.""" if ( @@ -283,19 +274,11 @@ def get_weightstream_width(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") wp = self.get_weight_datatype().bitwidth() - assert ( - wp <= 8 - ), "RTL-based MVAU only supports weights with bit-width up to 8-bits" w_width = pe * simd * wp return w_width else: return 0 - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - out_width = o_bits * self.get_nodeattr("PE") - return out_width - def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required by the AXI Stream spec. Used in decoupled mode.""" @@ -964,8 +947,7 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " @@ -981,8 +963,7 @@ def code_generation_ipi(self): strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " @@ -1036,8 +1017,7 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1052,4 +1032,4 @@ def code_generation_ipi(self): return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") - return cmd \ No newline at end of file + return cmd diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index e793321879..79265f8daa 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -28,10 +28,11 @@ import math import numpy as np -import os +import onnx.numpy_helper as np_helper import textwrap import warnings from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, @@ -39,16 +40,7 @@ ) from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) -import onnx.numpy_helper as np_helper -import qonnx.custom_op.general.xnorpopcount as xp -from qonnx.custom_op.general.multithreshold import multithreshold - +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string class VectorVectorActivation(HWCustomOp): @@ -131,19 +123,22 @@ def execute_node(self, context, graph): # Reorder the input activations. Note that PE gets interleaved by the SWG, # so we have to untangle and for simplicity of computation assume pe=1. # Note that PE has no effect on the QONNX node - in_act = in_act.reshape(1, dim_h, dim_w, channels // pe, k_h*k_w, pe) + in_act = in_act.reshape(1, dim_h, dim_w, channels // pe, k_h * k_w, pe) in_act = in_act.transpose(0, 1, 2, 4, 3, 5) - in_act = in_act.reshape(1, dim_h, dim_w, channels*k_h*k_w) + in_act = in_act.reshape(1, dim_h, dim_w, channels * k_h * k_w) # Reshape weights in appropriate format vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] vvau_w = np_helper.to_array(vvau_w_init) vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels) - if self.get_nodeattr("inputDataType") == "BIPOLAR" and self.get_nodeattr("weightDataType") == "BIPOLAR": - result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format - result = (result + k_h*k_w) / 2 + if ( + self.get_nodeattr("inputDataType") == "BIPOLAR" + and self.get_nodeattr("weightDataType") == "BIPOLAR" + ): + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format + result = (result + k_h * k_w) / 2 else: - result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format if self.get_nodeattr("noActivation") == 0: vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] @@ -152,16 +147,16 @@ def execute_node(self, context, graph): out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") # NHWC to NCHW for multithreshold node - result = result.transpose((0,3,1,2)) + result = result.transpose((0, 3, 1, 2)) result = multithreshold(result, vvau_thr, out_scale, out_bias) # NCHW to NHWC - result = result.transpose((0,2,3,1)) - + result = result.transpose((0, 2, 3, 1)) + context[node.output[0]] = result def verify_node(self): pass - + def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() return super().make_const_shape_op(oshape) @@ -203,7 +198,7 @@ def get_instream_width(self, ind=0): pe = self.get_nodeattr("PE") in_width = i_bits * simd * pe return in_width - + def get_weightstream_width(self): """Returns weight stream width. Used only in decoupled mode.""" if ( @@ -962,4 +957,4 @@ def code_generation_ipi(self): return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") - return cmd \ No newline at end of file + return cmd diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 26cd0b74ad..ade76afdde 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1281,6 +1281,7 @@ def apply(self, model): return (model, graph_modified) + class InferBinaryMatrixVectorActivation(Transformation): """Convert XnorPopcountMatMul layers to MatrixVectorActivation layers. Any immediately following MultiThreshold @@ -1414,6 +1415,7 @@ def apply(self, model): model = model.transform(InferDataTypes()) return (model, graph_modified) + class InferQuantizedMatrixVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to MatrixVectorActivation layers.""" @@ -1550,6 +1552,7 @@ def apply(self, model): model = model.transform(InferDataTypes()) return (model, graph_modified) + class InferVectorVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to VectorVectorActivation layers, if the sparsity annotation @@ -1694,4 +1697,4 @@ def apply(self, model): if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) - return (model, graph_modified) \ No newline at end of file + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 81c5848d57..1c316e1285 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -48,13 +48,13 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) - op_type = node_inst.base_op_type() + op_type = node.op_type producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if op_type == "MatrixVectorActivation": + if op_type.startswith("MatrixVectorActivation"): if node_inst.get_nodeattr("mem_mode") == "external": return True return False diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 56e644f2b8..6149dffd59 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -150,7 +150,7 @@ def apply(self, model): continue elif not ( - node_inst.base_op_type() == "MatrixVectorActivation" + node.op_type.startswith("MatrixVectorActivation") and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index d0029cb630..f6dd587c76 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -88,7 +88,7 @@ def apply(self, model): # - if FC and external mem, it could be connected to input 1 # - if concat, could be connected to any input if ( - n1.base_op_type() == "MatrixVectorActivation" + consumer.op_type.startswith("MatrixVectorActivation") and n1.get_nodeattr("mem_mode") == "external" ) or (consumer.op_type == "StreamingConcat"): # get input idx diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index fd546459fa..f3334d94f5 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -199,7 +199,8 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: getCustomOp(x).base_op_type() in ["MatrixVectorActivation", "VectorVectorActivation"] + lambda x: x.op_type + in ["MatrixVectorActivation_hls", "VectorVectorActivation_hls"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index ab5142e4d8..fbb64428aa 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -103,7 +103,7 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" + first_node.op_type.startswith("MatrixVectorActivation") and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): @@ -117,7 +117,7 @@ def apply(self, model): num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if getCustomOp(first_node).base_op_type() == "MatrixVectorActivation" and inp_idx == 1: + if first_node.op_type.startswith("MatrixVectorActivation") and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index e66236bf39..9a5317e588 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -282,7 +282,9 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) rt_layer_ind = 0 for node in dataflow_model.graph.node: - if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch": + if node.op_type.startswith("MatrixVectorActivation") or node.op_type.startswith( + "Thresholding" + ): node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 193e6e8b42..2f58064f11 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -62,7 +62,9 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if getCustomOp(node).base_op_type() == "MatrixVectorActivation" or node.op_type == "Thresholding_Batch": + if node.op_type.startswith("MatrixVectorActivation") or node.op_type.startswith( + "Thresholding" + ): if node_inst.get_nodeattr("mem_mode") == "decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 84a8084832..f2aefc25dd 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -174,7 +174,7 @@ def apply(self, model): continue if fifo_cons is None: continue - if getCustomOp(fifo_cons).base_op_type() != "MatrixVectorActivation": + if not fifo_cons.op_type.startswith("MatrixVectorActivation"): continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -257,7 +257,7 @@ def __init__( def apply(self, model): # these optypes may potentially use external weights # we'll temporarily change them to use decoupled mode for FIFO sizing - extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] + extw_optypes = ["MatrixVectorActivation_hls", "VectorVectorActivation_hls"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] @@ -281,7 +281,7 @@ def apply(self, model): node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - if getCustomOp(node).base_op_type() in extw_optypes: + if node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -422,7 +422,7 @@ def apply(self, model): # (removed setting of node FIFO size attributes to 0 here) # for every extw node we changed from external to decoupled, # change back and reset implementation - if getCustomOp(node).base_op_type() in extw_optypes: + if node.op_type in extw_optypes: if node.name in modified_fc_nodes: node_inst = getCustomOp(node) node_inst.set_nodeattr("mem_mode", "external") diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 7b65023abc..62457f164a 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -125,7 +125,7 @@ def apply(self, model): continue op_type = node.op_type node_inst = getCustomOp(node) - if node_inst.base_op_type() == "MatrixVectorActivation": + if op_type.startswith("MatrixVectorActivation"): max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 38f77e3836..7e632b4018 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -52,8 +52,6 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from qonnx.transformation.general import ApplyConfig, GiveUniqueNodeNames, GiveReadableTensorNames -from qonnx.transformation.infer_shapes import InferShapes from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers @@ -401,15 +399,9 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_hls_0" in hls_synt_res_est - assert "MatrixVectorActivation_hls_0" in hls_synt_res_est - else: - assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est assert "MatrixVectorActivation_hls_0" in hls_synt_res_est - else: - assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + node = model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 05881df6baad3b0dc1bbc683c446830a7a5c6101 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 7 Feb 2024 15:30:22 +0000 Subject: [PATCH 116/291] [Util] Introduce new functions to check if node is hls or rtl --- .../custom_op/fpgadataflow/hlscustomop.py | 868 ------------------ .../fpgadataflow/compile_cppsim.py | 7 +- .../fpgadataflow/hlssynth_ip.py | 9 +- .../fpgadataflow/prepare_cppsim.py | 7 +- .../fpgadataflow/set_exec_mode.py | 18 +- src/finn/util/fpgadataflow.py | 28 + 6 files changed, 54 insertions(+), 883 deletions(-) delete mode 100644 src/finn/custom_op/fpgadataflow/hlscustomop.py diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hlscustomop.py deleted file mode 100644 index 4fed8ed4b5..0000000000 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ /dev/null @@ -1,868 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import subprocess -import warnings -from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io -from qonnx.core.datatype import DataType -from qonnx.custom_op.base import CustomOp -from qonnx.util.basic import roundup_to_integer_multiple - -from finn.util.basic import ( - CppBuilder, - get_rtlsim_trace_depth, - make_build_dir, - pyverilate_get_liveness_threshold_cycles, -) -from finn.util.hls import CallHLS -from finn.util.pyverilator import make_single_source_file - -from . import templates - -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - - -class HLSCustomOp(CustomOp): - """HLSCustomOp class all custom ops that correspond to a finn-hlslib - function are based on. Contains different functions every fpgadataflow - custom node should have. Some as abstract methods, these have to be filled - when writing a new fpgadataflow custom op node.""" - - def __init__(self, onnx_node, **kwargs): - super().__init__(onnx_node, **kwargs) - - self.code_gen_dict = {} - - # getting templates from templates.py - - # template for single node execution - self.docompute_template = templates.docompute_template - - # templates for single node ip generation - # cpp file - self.ipgen_template = templates.ipgen_template - # tcl script - self.ipgentcl_template = templates.ipgentcl_template - - def get_nodeattr_types(self): - return { - "backend": ("s", True, "fpgadataflow"), - "code_gen_dir_cppsim": ("s", False, ""), - "code_gen_dir_ipgen": ("s", False, ""), - "executable_path": ("s", False, ""), - "ipgen_path": ("s", False, ""), - "ip_path": ("s", False, ""), - "ip_vlnv": ("s", False, ""), - "exec_mode": ("s", False, "", {"", "rtlsim", "cppsim"}), - "cycles_rtlsim": ("i", False, 0), - "cycles_estimate": ("i", False, 0), - "rtlsim_trace": ("s", False, ""), - "res_estimate": ("s", False, ""), - "res_hls": ("s", False, ""), - "res_synth": ("s", False, ""), - "rtlsim_so": ("s", False, ""), - # partitioning info - # ID of SLR to which the Op is attached in Vitis builds - # Set to -1 as 'don't care' - "slr": ("i", False, -1), - # Vitis memory port to which any AXI-MM interface - # of this Op should be attached in Vitis builds - # E.g.: "DDR[0]", "HBM[0]", "PLRAM[0]" - "mem_port": ("s", False, ""), - # Partition to which the Op belongs; all Ops with the - # same partition_id are stitched together - # Users should avoid setting this attribute manually - # and instead use the floorplan transform to set - # partition IDs from Vitis design rules and SLR IDs - "partition_id": ("i", False, 0), - # ID of FPGA device to which this Op is allocated, in - # a multi-FPGA setting - "device_id": ("i", False, 0), - # input and output FIFO depths for multi-I/O nodes - "inFIFODepths": ("ints", False, [2]), - "outFIFODepths": ("ints", False, [2]), - "output_hook": ("s", False, ""), - # accumulated characteristic function over two periods - "io_chrc_in": ("t", False, np.asarray([], dtype=np.int32)), - "io_chrc_out": ("t", False, np.asarray([], dtype=np.int32)), - # the period for which the characterization was run - "io_chrc_period": ("i", False, 0), - # amount of zero padding inserted during chrc. - "io_chrc_pads_in": ("ints", False, []), - "io_chrc_pads_out": ("ints", False, []), - } - - def get_verilog_top_module_name(self): - "Return the Verilog top module name for this node." - - node = self.onnx_node - prefixed_top_name = node.name - - return prefixed_top_name - - def get_verilog_top_module_intf_names(self): - """Return a dict of names of input and output interfaces. - The keys reflect the protocols each interface implements: - 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. - Values are lists of tuples (axis, aximm) or names (axilite): - 'axis' tuples correspond to the list of node inputs in order, - each tuple is (interface_name, interface_width_bits). - axilite always assumed to be 32 bits and is not tuple (name only). - Each block must have at most one aximm and one axilite.""" - intf_names = {} - intf_names["clk"] = ["ap_clk"] - intf_names["rst"] = ["ap_rst_n"] - sname = self.hls_sname() - intf_names["s_axis"] = [("in0_" + sname, self.get_instream_width_padded())] - intf_names["m_axis"] = [("out_" + sname, self.get_outstream_width_padded())] - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] - return intf_names - - def get_verilog_top_filename(self): - "Return the Verilog top module filename for this node." - - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - self.get_nodeattr("code_gen_dir_ipgen"), - self.onnx_node.name, - self.get_verilog_top_module_name(), - ) - return verilog_file - - def get_all_verilog_paths(self): - "Return list of all folders containing Verilog code for this node." - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - assert ( - code_gen_dir != "" - ), """Node attribute "code_gen_dir_ipgen" is - not set. Please run HLSSynthIP first.""" - verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name) - # default impl only returns the HLS verilog codegen dir - return [verilog_path] - - def get_all_verilog_filenames(self, abspath=False): - "Return list of all Verilog files used for this node." - - verilog_files = [] - verilog_paths = self.get_all_verilog_paths() - for verilog_path in verilog_paths: - for f in os.listdir(verilog_path): - if f.endswith(".v"): - if abspath: - verilog_files += [verilog_path + "/" + f] - else: - verilog_files += [f] - return verilog_files - - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - verilog_files = self.get_all_verilog_filenames(abspath=True) - single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") - tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") - target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" - make_single_source_file(verilog_files, target_file) - - # build the Verilator emu library - sim = PyVerilator.build( - self.get_verilog_top_module_name() + ".v", - build_dir=tmp_build_dir, - verilog_path=[single_src_dir], - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - - def get_rtlsim(self): - """Return a PyVerilator wrapper for the Verilator emulation library - for this node.""" - - rtlsim_so = self.get_nodeattr("rtlsim_so") - assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library." - # create PyVerilator wrapper - sim = PyVerilator(rtlsim_so) - return sim - - def node_res_estimation(self): - """Returns summarized resource estimation of BRAMs and LUTs - of the node as a dictionary.""" - ret = dict() - ret["BRAM_18K"] = self.bram_estimation() - ret["BRAM_efficiency"] = self.bram_efficiency_estimation() - ret["LUT"] = self.lut_estimation() - ret["URAM"] = self.uram_estimation() - ret["URAM_efficiency"] = self.uram_efficiency_estimation() - ret["DSP"] = self.dsp_estimation() - return ret - - def bram_efficiency_estimation(self): - """Function for BRAM efficiency estimation: actual parameter storage - needed divided by the allocated BRAM storage (from estimation)""" - return 1 - - def uram_efficiency_estimation(self): - """Function for URAM efficiency estimation: actual parameter storage - needed divided by the allocated URAM storage (from estimation)""" - return 1 - - def bram_estimation(self): - """Function for BRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" - return 0 - - def uram_estimation(self): - """Function for UltraRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" - return 0 - - def lut_estimation(self): - """Function for LUT resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" - return 0 - - def dsp_estimation(self): - """Function for DSP resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" - return 0 - - def get_exp_cycles(self): - """Function for estimation of expected cycles for set folding, - is member function of HLSCustomOp class but has to be filled - by every node""" - return 0 - - def get_op_and_param_counts(self): - """Return a dictionary with number of ops needed per inference for - this layer as well as parameter count (weights, thresholds, etc.). - Entries should be in the format: - {op_ : , param_: }.""" - return {} - - def code_generation_ipgen(self, model, fpgapart, clk): - """Generates c++ code and tcl script for ip generation.""" - node = self.onnx_node - - # generate top cpp file for ip generation - path = self.get_nodeattr("code_gen_dir_ipgen") - self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] - self.generate_params(model, path) - self.global_includes() - self.defines("ipgen") - self.blackboxfunction() - self.pragmas() - self.docompute() - - template = self.ipgen_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - # generate tcl script for ip generation - self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] - self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] - self.code_gen_dict["$FPGAPART$"] = [fpgapart] - self.code_gen_dict["$TOPFXN$"] = [node.name] - self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] - self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() - self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() - - template = self.ipgentcl_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def ipgen_default_directives(self): - """Return list of default HLS synthesis directives""" - - default_directives = [ - "set_param hls.enable_hidden_option_error false", - "config_compile -disable_unroll_code_size_check -pipeline_style flp", - "config_interface -m_axi_addr64", - "config_rtl -module_auto_prefix", - "config_rtl -deadlock_detection none", - ] - return default_directives - - def ipgen_extra_directives(self): - "Return a list of extra tcl directives for HLS synthesis." - return [] - - def ipgen_singlenode_code(self): - """Builds the bash script for IP generation using the CallHLS utility.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - builder = CallHLS() - builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name)) - builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) - builder.build(code_gen_dir) - ipgen_path = builder.ipgen_path - assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path) - self.set_nodeattr("ipgen_path", ipgen_path) - ip_path = ipgen_path + "/sol1/impl/ip" - assert os.path.isdir(ip_path), "IPGen failed: %s not found. Check log under %s" % ( - ip_path, - code_gen_dir, - ) - self.set_nodeattr("ip_path", ip_path) - vlnv = "xilinx.com:hls:%s:1.0" % node.name - self.set_nodeattr("ip_vlnv", vlnv) - - def code_generation_cppsim(self, model): - """Generates c++ code for simulation (cppsim).""" - node = self.onnx_node - path = self.get_nodeattr("code_gen_dir_cppsim") - self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] - self.generate_params(model, path) - self.global_includes() - self.defines("cppsim") - self.read_npy_data() - self.strm_decl() - self.pragmas() - self.docompute() - self.dataoutstrm() - self.save_as_npy() - - template = self.docompute_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def code_generation_ipi(self): - """Constructs and returns the TCL for node instantiation in Vivado IPI.""" - vlnv = self.get_nodeattr("ip_vlnv") - cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)] - return cmd - - def compile_singlenode_code(self): - """Builds the bash script for compilation using the CppBuilder from - finn.util.basic and executes the script to produce the executable.""" - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - builder = CppBuilder() - # to enable additional debug features please uncommand the next line - # builder.append_includes("-DDEBUG") - builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") - builder.append_includes("-I$FINN_ROOT/deps/cnpy/") - builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") - builder.append_includes("-I$FINN_ROOT/custom_hls") - builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) - builder.append_includes("--std=c++14") - builder.append_includes("-O3") - builder.append_sources(code_gen_dir + "/*.cpp") - builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") - builder.append_includes("-lz") - builder.set_executable_path(code_gen_dir + "/node_model") - builder.build(code_gen_dir) - self.set_nodeattr("executable_path", builder.executable_path) - - def dynamic_input_to_npy(self, context, count, target_dir=""): - """Saves input (given context) into .npy files. - - Count indicates the number of inputs that have to be saved.""" - node = self.onnx_node - if target_dir == "": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - if code_gen_dir == "": - raise Exception( - """ - Found no codegen dir for this node, did you run the prepare_cppsim transformation? - """ - ) - target_dir = code_gen_dir - # create a npy file for each input of the node (in_ind is input index) - # assuming dynamic inputs start from 0 - for in_ind in range(count): - current_input_name = node.input[in_ind] - input_array = context[current_input_name] - if in_ind == 0: - expected_inp_shape = self.get_folded_input_shape() - idt = self.get_input_datatype() - else: - expected_inp_shape = self.get_folded_input_shape(in_ind) - idt = self.get_input_datatype(in_ind) - reshaped_input = input_array.reshape(expected_inp_shape) - if idt == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(target_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - - def npy_to_dynamic_output(self, context): - """Reads the output from an output.npy file generated from cppsim and - places its content into the context dictionary.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - output = np.load("{}/output.npy".format(code_gen_dir)) - exp_shape = self.get_normal_output_shape() - context[node.output[0]] = output.reshape(exp_shape) - - def npy_to_dynamic_outputs(self, context, npy_list): - """Reads the output from .npy files generated from cppsim and places - their content into the context dictionary. - npy_list is a list specifying which files to read, and its order must - match the order of node outputs.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - for i in range(len(npy_list)): - output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) - if i == 0: - exp_shape = self.get_normal_output_shape() - else: - exp_shape = self.get_normal_output_shape(i) - context[node.output[i]] = output.reshape(exp_shape) - - def exec_precompiled_singlenode_model(self): - """Executes precompiled executable.""" - executable_path = self.get_nodeattr("executable_path") - if executable_path == "": - raise Exception( - """ -Found no executable for this node, did you run the codegen and -compilation transformations? - """ - ) - process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) - process_execute.communicate() - - def reset_rtlsim(self, sim): - """Sets reset input in pyverilator to zero, toggles the clock and set it - back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 - - def toggle_clk(self, sim): - """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - - def hls_sname(self): - """Get the naming convention used by Vitis HLS for stream signals - Example: the TDATA for a stream called "out" would be out_V_TDATA. - """ - return "V" - - def rtlsim(self, sim, inp, inp2=None): - """Runs the pyverilator simulation by passing the input values to the simulation, - toggle the clock and observing the execution time. Function contains also an - observation loop that can abort the simulation if no output value is produced - after 100 cycles.""" - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file != "": - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" - sim.start_vcd_trace(trace_file) - inputs = inp - outputs = [] - sname = self.hls_sname() - o_ready = "out_" + sname + "_TREADY" - o_valid = "out_" + sname + "_TVALID" - o_data = "out_" + sname + "_TDATA" - in0_ready = "in0_" + sname + "_TREADY" - in0_valid = "in0_" + sname + "_TVALID" - in0_data = "in0_" + sname + "_TDATA" - in1_ready = "in1_" + sname + "_TREADY" - in1_valid = "in1_" + sname + "_TVALID" - in1_data = "in1_" + sname + "_TDATA" - - sim.io[o_ready] = 1 - - # observe if output is completely calculated - # observation_count will contain the number of cycles the calculation ran - num_out_values = self.get_number_output_values() - output_observed = False - observation_count = 0 - - # avoid infinite looping of simulation by aborting when there is no change in - # output values after 100 cycles - no_change_count = 0 - old_outputs = outputs - liveness_threshold = pyverilate_get_liveness_threshold_cycles() - - while not (output_observed): - sim.io[in0_valid] = 1 if len(inputs) > 0 else 0 - sim.io[in0_data] = inputs[0] if len(inputs) > 0 else 0 - if sim.io[in0_ready] == 1 and sim.io[in0_valid] == 1: - inputs = inputs[1:] - - if inp2 is not None: - sim.io[in1_valid] = 1 if len(inp2) > 0 else 0 - sim.io[in1_data] = inp2[0] if len(inp2) > 0 else 0 - if sim.io[in1_ready] == 1 and sim.io[in1_valid] == 1: - inp2 = inp2[1:] - - if sim.io[o_valid] == 1 and sim.io[o_ready] == 1: - outputs = outputs + [sim.io[o_data]] - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - - observation_count = observation_count + 1 - no_change_count = no_change_count + 1 - - if len(outputs) == num_out_values: - self.set_nodeattr("cycles_rtlsim", observation_count) - output_observed = True - - if no_change_count == liveness_threshold: - if old_outputs == outputs: - if trace_file != "": - sim.flush_vcd_trace() - sim.stop_vcd_trace() - raise Exception( - "Error in simulation! Takes too long to produce output. " - "Consider setting the LIVENESS_THRESHOLD env.var. to a " - "larger value." - ) - else: - no_change_count = 0 - old_outputs = outputs - if trace_file != "": - sim.flush_vcd_trace() - sim.stop_vcd_trace() - return outputs - - def rtlsim_multi_io(self, sim, io_dict): - "Run rtlsim for this node, supports multiple i/o streams." - - # signal name - sname = "_" + self.hls_sname() + "_" - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" - num_out_values = self.get_number_output_values() - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - ) - self.set_nodeattr("cycles_rtlsim", total_cycle_count) - - def execute_node(self, context, graph): - """Executes single node using cppsim or rtlsim.""" - mode = self.get_nodeattr("exec_mode") - if mode == "cppsim": - # save input(s) - self.dynamic_input_to_npy(context, 1) - # execute the precompiled model - self.exec_precompiled_singlenode_model() - # load output npy file - self.npy_to_dynamic_output(context) - elif mode == "rtlsim": - pass - - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def generate_params(self, model, path): - """Function to generate parameters (i.e. weights and thresholds), - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def get_number_output_values(self): - """Function to get the number of expected output values, - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def global_includes(self): - """Function to set the global includes for c++ code that has to be generated - for cppsim or rtlsim, is member function of HLSCustomOp class but has to - be filled by every node.""" - pass - - @abstractmethod - def defines(self, var): - """Function to set the define commands for c++ code that has to be generated - for cppsim or rtlsim, is member function of HLSCustomOp class but has to - be filled by every node. - - var: makes it possible to reuse the function for different c++ code generation. - I.e. if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are - added.""" - pass - - @abstractmethod - def read_npy_data(self): - """Function to generate the commands for reading data from .npy file in c++, - is member function of HLSCustomOp class but has to be filled by every node.""" - pass - - @abstractmethod - def strm_decl(self): - """Function to generate the commands for the stream declaration in c++, - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def docompute(self): - """Function to generate the commands for the computational part of the - c++ code, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def dataoutstrm(self): - """Function to generate the commands for reading out data from c++ and convert - into npy format, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def save_as_npy(self): - """Function to generate the commands for saving data in .npy file in c++, - is member function of HLSCustomOp class but has to be filled by every node.""" - pass - - @abstractmethod - def blackboxfunction(self): - """Function to generate a blackbock function in c++ from which an IP block - will be generated, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def pragmas(self): - """Function to generate the pragma commands in c++, is member function of - HLSCustomOp class but has to be filled by every node.""" - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input stream ind.""" - raise Exception("get_input_datatype not implemented for this op") - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output stream ind.""" - raise Exception("get_output_datatype not implemented for this op") - - def get_normal_input_shape(self, ind=0): - """Returns normal input shape if implemented.""" - raise Exception("get_normal_input_shape not implemented for this op") - - def get_normal_output_shape(self, ind=0): - """Returns folded output shape if implemented.""" - raise Exception("get_normal_output_shape not implemented for this op") - - def get_folded_input_shape(self, ind=0): - """Returns folded input shape (according to synapse folding), if implemented.""" - raise Exception("get_folded_input_shape not implemented for this op") - - def get_folded_output_shape(self, ind=0): - """Returns folded output shape (according to neuron folding), if implemented.""" - raise Exception("get_folded_output_shape not implemented for this op") - - def get_instream_width(self, ind=0): - """Returns input stream width, if implemented.""" - raise Exception("get_instream_width not implemented for this op") - - def get_outstream_width(self, ind=0): - """Returns output stream width, if implemented.""" - raise Exception("get_outstream_width not implemented for this op") - - def get_instream_width_padded(self, ind=0): - """Returns input stream width padded to a multiple of 8. This is required - by the AXI Stream spec.""" - in_width = self.get_instream_width(ind=ind) - return roundup_to_integer_multiple(in_width, 8) - - def get_outstream_width_padded(self, ind=0): - """Returns output stream width padded to a multiple of 8. This is required - by the AXI Stream spec.""" - out_width = self.get_outstream_width(ind=ind) - return roundup_to_integer_multiple(out_width, 8) - - def get_ap_int_max_w(self): - """Return the maximum width of any ap_int used in this module. Used to set the - AP_INT_MAX_W definition for HLS.""" - instream = self.get_instream_width() - outstream = self.get_outstream_width() - ret = max([instream, outstream]) - assert ret <= 32768, "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret - return ret - - def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): - """Return the unconstrained characteristic functions for this node.""" - # ensure rtlsim is ready - assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name - if self.get_nodeattr("io_chrc_period") > 0: - warnings.warn("Skipping node %s: already has FIFO characteristic" % self.onnx_node.name) - return - exp_cycles = self.get_exp_cycles() - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - n_outs = np.prod(self.get_folded_output_shape()[:-1]) - if exp_cycles == 0: - # try to come up with an optimistic estimate - exp_cycles = min(n_inps, n_outs) - assert ( - exp_cycles <= period - ), "Period %d too short to characterize %s : expects min %d cycles" % ( - period, - self.onnx_node.name, - exp_cycles, - ) - sim = self.get_rtlsim() - # signal name - sname = "_" + self.hls_sname() + "_" - if override_rtlsim_dict is not None: - io_dict = override_rtlsim_dict - else: - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - - # extra dicts to keep track of cycle-by-cycle transaction behavior - # note that we restrict key names to filter out weight streams etc - txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} - txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} - - def monitor_txns(sim_obj): - for inp in txns_in: - in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 - in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 - if in_ready and in_valid: - txns_in[inp].append(1) - else: - txns_in[inp].append(0) - for outp in txns_out: - if ( - _read_signal(sim, outp + sname + "TREADY") == 1 - and _read_signal(sim, outp + sname + "TVALID") == 1 - ): - txns_out[outp].append(1) - else: - txns_out[outp].append(0) - - reset_rtlsim(sim) - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - n_outs, - sname=sname, - liveness_threshold=period, - hook_preclk=monitor_txns, - ) - assert ( - total_cycle_count <= period - ), """Total cycle count from rtl simulation is higher than - specified period, please set the period higher than {}""".format( - total_cycle_count - ) - self.set_nodeattr("io_chrc_period", period) - - def accumulate_char_fxn(chrc): - p = len(chrc) - ret = [] - for t in range(2 * p): - if t == 0: - ret.append(chrc[0]) - else: - ret.append(ret[-1] + chrc[t % p]) - return np.asarray(ret, dtype=np.int32) - - all_txns_in = np.empty((len(txns_in.keys()), 2 * period), dtype=np.int32) - all_txns_out = np.empty((len(txns_out.keys()), 2 * period), dtype=np.int32) - all_pad_in = [] - all_pad_out = [] - for in_idx, in_strm_nm in enumerate(txns_in.keys()): - txn_in = txns_in[in_strm_nm] - if len(txn_in) < period: - pad_in = period - len(txn_in) - txn_in += [0 for x in range(pad_in)] - txn_in = accumulate_char_fxn(txn_in) - all_txns_in[in_idx, :] = txn_in - all_pad_in.append(pad_in) - - for out_idx, out_strm_nm in enumerate(txns_out.keys()): - txn_out = txns_out[out_strm_nm] - if len(txn_out) < period: - pad_out = period - len(txn_out) - txn_out += [0 for x in range(pad_out)] - txn_out = accumulate_char_fxn(txn_out) - all_txns_out[out_idx, :] = txn_out - all_pad_out.append(pad_out) - - self.set_nodeattr("io_chrc_in", all_txns_in) - self.set_nodeattr("io_chrc_out", all_txns_out) - self.set_nodeattr("io_chrc_pads_in", all_pad_in) - self.set_nodeattr("io_chrc_pads_out", all_pad_out) diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py index e93a8ec307..4814b24a92 100644 --- a/src/finn/transformation/fpgadataflow/compile_cppsim.py +++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,7 +30,7 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node class CompileCppSim(NodeLocalTransformation): @@ -50,7 +51,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node) and is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index 08069fa00f..daf64656b5 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ import warnings from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node class HLSSynthIP(NodeLocalTransformation): @@ -42,7 +43,7 @@ class HLSSynthIP(NodeLocalTransformation): Any nodes that already have a ipgen_path attribute pointing to a valid path will be skipped. - This transformation calls Vivado HLS for synthesis, so it will run for + This transformation calls Vitis HLS for synthesis, so it will run for some time (minutes to hours depending on configuration). * num_workers (int or None) number of parallel workers, see documentation in @@ -54,7 +55,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node) and is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 76c3f88310..0b744b5f4f 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from qonnx.util.basic import get_num_default_workers from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node def _codegen_single_node(node, model): @@ -78,7 +79,7 @@ def __init__(self, num_workers=None): self._num_workers = mp.cpu_count() def prepareCppSim_node(self, node): - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node) and is_hls_node(node): _codegen_single_node(node, self.model) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py index 8488b4ef83..7df4451a22 100644 --- a/src/finn/transformation/fpgadataflow/set_exec_mode.py +++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,12 +30,15 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import Transformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node, is_rtl_node class SetExecMode(Transformation): """Set attribute exec_mode in all fpgadataflow nodes to specify which - kind of execution should be used ("cppsim" or "rtlsim")""" + kind of execution should be used ("cppsim" or "rtlsim"). + Note that RTL components do not support cppsim. + If cppsim is selected, only HLS components will be set for cppsim, + RTL components default in this case to rtlsim execution mode.""" def __init__(self, mode): super().__init__() @@ -43,12 +47,16 @@ def __init__(self, mode): def apply(self, model): for node in model.graph.node: op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): + if self.mode == "cppsim" and is_rtl_node(node): + mode = "rtlsim" + else: + mode = self.mode try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) # set sim_mode accordingly to argument mode - inst.set_nodeattr("exec_mode", self.mode) + inst.set_nodeattr("exec_mode", mode) # ensure that sim_mode is now set assert ( inst.get_nodeattr("exec_mode") != "" diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 769ddb9465..aae438fac2 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -41,3 +41,31 @@ def is_fpgadataflow_node(node): is_node = True return is_node + + +def is_hls_node(node): + """Returns True if given node is hls node. Otherwise False.""" + is_node = False + if node is not None: + if node.domain == "finn.custom_op.fpgadataflow.hls": + n_backend = get_by_name(node.attribute, "backend") + if n_backend is not None: + backend_value = n_backend.s.decode("UTF-8") + if backend_value == "fpgadataflow": + is_node = True + + return is_node + + +def is_rtl_node(node): + """Returns True if given node is rtl node. Otherwise False.""" + is_node = False + if node is not None: + if node.domain == "finn.custom_op.fpgadataflow.rtl": + n_backend = get_by_name(node.attribute, "backend") + if n_backend is not None: + backend_value = n_backend.s.decode("UTF-8") + if backend_value == "fpgadataflow": + is_node = True + + return is_node From 94a2ff31d4e7e79c77536e97924b58e58f4c6329 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 7 Feb 2024 15:48:13 +0000 Subject: [PATCH 117/291] [Tests] First cleanup over tests to update to new flow --- tests/fpgadataflow/test_code_gen_trafo.py | 7 +- tests/fpgadataflow/test_compilation_trafo.py | 7 +- .../test_convert_to_hls_channelwise_layer.py | 134 ------------------ ...py => test_convert_to_hw_1d_conv_layer.py} | 28 ++-- ... test_convert_to_hw_conv_fc_transition.py} | 25 ++-- tests/fpgadataflow/test_fpgadataflow_vvau.py | 2 +- tests/fpgadataflow/test_runtime_weights.py | 8 +- 7 files changed, 51 insertions(+), 160 deletions(-) delete mode 100644 tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py rename tests/fpgadataflow/{test_convert_to_hls_1d_conv_layer.py => test_convert_to_hw_1d_conv_layer.py} (88%) rename tests/fpgadataflow/{test_convert_to_hls_conv_fc_transition.py => test_convert_to_hw_conv_fc_transition.py} (90%) diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py index f5edabbd4b..709333949e 100644 --- a/tests/fpgadataflow/test_code_gen_trafo.py +++ b/tests/fpgadataflow/test_code_gen_trafo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -50,10 +51,10 @@ def test_code_gen_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MatrixVectorActivation_hls", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", code_gen_dir="", executable_path="", diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py index d04b68a56b..1b48df3d4a 100644 --- a/tests/fpgadataflow/test_compilation_trafo.py +++ b/tests/fpgadataflow/test_compilation_trafo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -51,10 +52,10 @@ def test_compilation_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MatrixVectorActivation_hls", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", code_gen_dir="", executable_path="", diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py deleted file mode 100644 index bb2c1d74c2..0000000000 --- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) - p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) - - model = qonnx_make_model( - helper.make_graph( - name="test", - inputs=[inp], - outputs=[outp], - value_info=[p0], - nodes=[helper.make_node(onnx_op_name, ["inp", "p0"], ["outp"])], - ) - ) - - model = ModelWrapper(model) - model.set_initializer("p0", gen_finn_dt_tensor(pdt, pshape)) - model.set_tensor_datatype("inp", idt) - model.transform(InferDataLayouts(), make_deepcopy=False) - model.transform(InferShapes(), make_deepcopy=False) - return model - - -# parameter datatype -@pytest.mark.parametrize("pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]]) -# input datatype -@pytest.mark.parametrize("idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]]) -# function -@pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"]) -# vector parameter or scalar parameter (broadcast) -@pytest.mark.parametrize("scalar_param", [True, False]) -# execution mode -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -@pytest.mark.slow -def test_convert_to_hls_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, exec_mode): - ifm_ch = 16 - ifm_dim = 5 - ishape = (1, ifm_ch, ifm_dim, ifm_dim) - if scalar_param: - pshape = (1,) - else: - pshape = (1, ifm_ch, 1, 1) - - np.random.seed(0) - model = make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) - - # Since the aren't Data types with a bit width of a non power of 2, - # there are cases where the input won't use it full range. - if idt == DataType["INT32"]: - x = gen_finn_dt_tensor(DataType["INT16"], (1, ifm_ch, ifm_dim, ifm_dim)) - elif idt == DataType["UINT32"]: - x = gen_finn_dt_tensor(DataType["UINT16"], (1, ifm_ch, ifm_dim, ifm_dim)) - else: - x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim)) - - input_dict = prepare_inputs(x) - y_expected = oxe.execute_onnx(model, input_dict)["outp"] - - new_model = model.transform(to_hls.InferChannelwiseLinearLayer()) - new_model = new_model.transform(GiveUniqueNodeNames()) - - if exec_mode == "cppsim": - new_model = new_model.transform(PrepareCppSim()) - new_model = new_model.transform(CompileCppSim()) - new_model = new_model.transform(SetExecMode("cppsim")) - elif exec_mode == "rtlsim": - new_model = new_model.transform(SetExecMode("rtlsim")) - new_model = new_model.transform(GiveUniqueNodeNames()) - new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) - new_model = new_model.transform(HLSSynthIP()) - new_model = new_model.transform(PrepareRTLSim()) - else: - raise Exception("Unknown exec_mode") - - ctx_produced = oxe.execute_onnx(new_model, input_dict, return_full_exec_context=True) - y_produced = ctx_produced["outp"] - - assert (y_produced == y_expected).all() - assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch" diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py similarity index 88% rename from tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py rename to tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 2af0957e12..32ec229334 100644 --- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,7 +42,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -49,6 +50,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node # conv_config: @@ -86,7 +89,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ pad_w = pad[1] + pad[3] if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL SWG") + pytest.skip("Skip cppsim if SWG is in rtl") if depthwise is True: group = out_chn = in_chn @@ -135,12 +138,19 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) + new_model = new_model.transform(to_hw.InferConvInpGen()) + if not use_rtl_swg: + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) else: - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation")[0] + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) + fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") @@ -171,12 +181,12 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ assert oxe.compare_execution(model, new_model, inp_dict) if pad_h == 1 and pad_w == 1: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = new_model.get_nodes_by_op_type("VectorVectorActivation_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py similarity index 90% rename from tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py rename to tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py index 94007bdd14..59d65c820d 100755 --- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,6 +35,7 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames, RemoveUnusedTensors from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes @@ -42,14 +44,16 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants +from finn.util.fpgadataflow import is_fpgadataflow_node def get_multithreshold_rand_params(channels, num_of_thres, seed=None): @@ -187,15 +191,20 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): new_model = new_model.transform(InferDataLayouts()) new_model = new_model.transform(RemoveUnusedTensors()) - # convert_to_hls + # convert_to_hw if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(to_hls.InferThresholdingLayer()) - new_model = new_model.transform(to_hls.InferConvInpGen()) - new_model = new_model.transform(to_hls.InferStreamingMaxPool()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(to_hw.InferThresholdingLayer()) + new_model = new_model.transform(to_hw.InferConvInpGen()) + new_model = new_model.transform(to_hw.InferStreamingMaxPool()) new_model = new_model.transform(RemoveCNVtoFCFlatten()) new_model = new_model.transform(absorb.AbsorbConsecutiveTransposes()) + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + new_model = new_model.transform(SpecializeLayers()) new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(InferDataLayouts()) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 1cb64dda91..d4fef6952d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -278,7 +278,7 @@ def test_fpgadataflow_vvau( assert (y_produced == y_expected).all(), "VVAU specialized-op mismatches with golden output!" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = model.get_nodes_by_op_type("VectorVectorActivation_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 9b2f418776..0f0d88dd35 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,6 +42,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.create import hls_random_mlp_maker test_fpga_part = "xczu3eg-sbva484-1-e" @@ -68,7 +70,8 @@ def test_runtime_weights_single_layer(): } layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) - fcl = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + model = model.transform(SpecializeLayers()) + fcl = model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] op_inst = getCustomOp(fcl) op_inst.set_nodeattr("mem_mode", "decoupled") op_inst.set_nodeattr("runtime_writeable_weights", 1) @@ -80,6 +83,7 @@ def test_runtime_weights_single_layer(): old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) From d24ef6358841355e00e0c2e2b9979acbf463cc85 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 8 Feb 2024 16:01:50 +0000 Subject: [PATCH 118/291] [CustomOp] Thresholding Generate Param --- .../fpgadataflow/rtl/thresholding_rtl.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 48aeb0b9f8..714930b73d 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -31,6 +31,7 @@ import os import shutil import warnings +from math import ceil, log2 from pyverilator.util.axi_utils import rtlsim_multi_io, reset_rtlsim from qonnx.core.datatype import DataType from qonnx.util.basic import ( @@ -705,4 +706,106 @@ def get_dynamic_config(self, model, address_stride=1): return config + def generate_params(self, model, path): + code_gen_dir = path + thresholds = model.get_initializer(self.onnx_node.input[1]) + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + # save thresholds in thresh.h + weight_filename = "{}/thresh.h".format(code_gen_dir) + self.make_weight_file(thresholds, "hls_header", weight_filename) + elif mem_mode == "decoupled": + # save decoupled weights for cppsim + weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) + # also save weights as Verilog .dat file + # This file will be ignored when synthesizing UltraScale memory. + weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl) + else: + raise Exception( + """Please set mem_mode to "const", "decoupled", + currently no other parameter value is supported!""" + ) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): + """Produce a file containing given weights (thresholds) in appropriate + format for this layer. This file can be used for either synthesis or + run-time reconfig of weights. + + Arguments: + + * weights : numpy array with weights to be put into the file + * weight_file_mode : one of { decoupled_verilog_dat, + decoupled_runtime} + * weight_file_name : filename for the weight file to be generated + + """ + threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + tdt = self.get_weight_datatype() + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + if "decoupled" in weight_file_mode: + # streaming thresholds need to be organized differently + # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) + decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) + # TODO add flips/reversals as needed here + # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) + pe = self.get_nodeattr("PE") + n_thres_steps = self.get_nodeattr("numSteps") + decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) + decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) + decoupled_thres = decoupled_thres.copy() + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( + 1, -1, pe * n_thres_steps + ) + decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() + width_padded = roundup_to_integer_multiple(pe * n_thres_steps, 4) + + # zero pad the columns + thres_padded = np.zeros((1, self.calc_tmem() ,width_padded)) + thres_padded[0, :self.calc_tmem(), :(pe * n_thres_steps) ] = decoupled_thres_pe_flipped + decoupled_thres_pe_flipped = thres_padded.copy() + weight_tensor_pe_flipped = [] + if weight_file_mode == "decoupled_npy": + # save weight stream into npy for cppsim + np.save(weight_file_name, decoupled_thres) + elif weight_file_mode == "decoupled_verilog_dat": + # convert weight values into hexstring + weight_width = self.get_weightstream_width() + # pad to nearest 4 bits to get hex strings + weight_width_padded = roundup_to_integer_multiple(weight_width, 4) + weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( + decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" + ) + weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_stream.copy() + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + elif weight_file_mode == "decoupled_runtime": + # memstream axi-lite interface will map each mem line to + # one or multiple 32-bit words + weight_width = self.get_weightstream_width() + words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) + if words_per_memwidth < 1: + words_per_memwidth = 1 + weight_width_padded = words_per_memwidth * 32 # convert to bits + # first, pack and ensure padding to 32 bits + for channel in decoupled_thres_pe_flipped[0]: + for weight in channel: + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) + weight_tensor_pe_flipped.append(pack_innermost_dim_as_hex_string( + [weight], wdt, bw_hexdigit, prefix="" + ).item()) + weight_stream = weight_tensor_pe_flipped.copy() + + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + else: + raise Exception("Decoupled weight export not yet implemented") + else: + raise Exception("Unknown weight_file_mode") From c4b7b4b9356a6ab19574e4603f7e9fa17d706433 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 8 Feb 2024 17:09:54 +0000 Subject: [PATCH 119/291] [Tests/transforms] Cleanup tests and transforms for new flow --- .../fpgadataflow/hls_synth_res_estimation.py | 7 +- .../fpgadataflow/derive_characteristic.py | 5 +- .../fpgadataflow/make_zynq_proj.py | 6 +- .../fpgadataflow/set_fifo_depths.py | 7 +- .../fpgadataflow/vitis_build.py | 7 +- tests/end2end/test_end2end_bnn_pynq.py | 114 ++++++--- tests/end2end/test_end2end_mobilenet_v1.py | 64 +++-- .../test_convert_to_hls_layers_synthetic.py | 222 ------------------ .../test_convert_to_hw_1d_conv_layer.py | 2 +- .../test_convert_to_hw_conv_fc_transition.py | 2 +- ...er.py => test_convert_to_hw_conv_layer.py} | 34 ++- ...nv.py => test_convert_to_hw_layers_cnv.py} | 37 +-- ..._fc.py => test_convert_to_hw_layers_fc.py} | 38 +-- .../test_convert_to_hw_layers_synthetic.py | 8 +- .../test_convert_to_hw_thresholding.py | 2 +- .../test_depthwise_convolution.py | 27 ++- ...dataflow_convinputgenerator_rtl_dynamic.py | 16 +- 17 files changed, 251 insertions(+), 347 deletions(-) delete mode 100644 tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py rename tests/fpgadataflow/{test_convert_to_hls_conv_layer.py => test_convert_to_hw_conv_layer.py} (86%) rename tests/fpgadataflow/{test_convert_to_hls_layers_cnv.py => test_convert_to_hw_layers_cnv.py} (84%) rename tests/fpgadataflow/{test_convert_to_hls_layers_fc.py => test_convert_to_hw_layers_fc.py} (88%) diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py index 4d921438f6..cd6b322727 100644 --- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py +++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py @@ -30,11 +30,12 @@ import warnings import xml.etree.ElementTree as ET -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node def hls_synth_res_estimation(model): - """Extracts the FPGA resource results from the Vivado HLS synthesis estimates. + """Extracts the FPGA resource results from the Vitis HLS synthesis estimates. + Note that this analysis pass only works on nodes that have an HLS backend. Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames transformation) prior to calling this analysis pass to ensure all nodes are visible in the results. @@ -43,7 +44,7 @@ def hls_synth_res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node) and is_hls_node(node): # init values to zero res_dict[node.name] = dict() res_dict[node.name]["BRAM_18K"] = 0 diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py index dc660f5fba..d5699e4dc6 100644 --- a/src/finn/transformation/fpgadataflow/derive_characteristic.py +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -1,4 +1,5 @@ -# Copyright (c) 2022, Xilinx +# Copyright (C) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -134,7 +135,7 @@ def applyNodeLocal(self, node): try: # lookup op_type in registry of CustomOps prod = registry.getCustomOp(node) - assert op_type != "StreamingFIFO", "Found existing FIFOs" + assert not (op_type.startswith("StreamingFIFO")), "Found existing FIFOs" period = prod.get_nodeattr("io_chrc_period") prod_chrc = prod.get_nodeattr("io_chrc_out")[0] assert len(prod_chrc) == 2 * period, "Found unexpected characterization attribute" diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 2f58064f11..65095f1de7 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,6 +46,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map from . import templates @@ -322,6 +324,7 @@ def apply(self, model): prep_transforms = [ InsertIODMA(self.axi_port_width), InsertDWC(), + SpecializeLayers(), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), ] @@ -337,6 +340,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(SpecializeLayers()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index f2aefc25dd..75c35df7d7 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,6 +48,7 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim @@ -294,12 +296,13 @@ def apply(self, model): # insert stream infrastructure (DWC/FIFO) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) # gather FIFO names, check they are of expected depth fifos = {} - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl") for node in fifo_nodes: fifos[node.name] = 0 node = getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index a102660001..da7624b8ff 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -49,6 +50,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir from . import templates @@ -381,7 +383,7 @@ def __init__( def apply(self, model): _check_vitis_envvars() # prepare at global level, then break up into kernels - prep_transforms = [InsertIODMA(512), InsertDWC()] + prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers()] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -403,6 +405,7 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(SpecializeLayers()) kernel_model = kernel_model.transform(RemoveUnusedTensors()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index b296dad827..53e5bb85eb 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -60,7 +61,7 @@ from qonnx.util.cleanup import cleanup as qonnx_cleanup from shutil import copy -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx @@ -85,6 +86,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline @@ -93,6 +95,7 @@ MoveScalarLinearPastInvariants, ) from finn.util.basic import get_finn_root, make_build_dir, test_board_map +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -119,7 +122,7 @@ def get_checkpoint_name(topology, wbits, abits, step): def fold_tfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") # (PE, SIMD, ramstyle) for each layer config = [(16, 49, "block"), (8, 8, "auto"), (8, 8, "auto"), (10, 8, "distributed")] for fcl, (pe, simd, ramstyle) in zip(fc_layers, config): @@ -128,7 +131,7 @@ def fold_tfc(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) inp_qnt.set_nodeattr("mem_mode", "decoupled") @@ -137,7 +140,7 @@ def fold_tfc(model): def fold_lfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") # (PE, SIMD, ramstyle) for each layer config = [ (32, 49, "block"), @@ -152,14 +155,14 @@ def fold_lfc(model): fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("runtime_writeable_weights", 1) # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) return model def fold_cnv_large(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") # each tuple is (PE, SIMD) for a layer folding = [ (16, 3), @@ -177,7 +180,7 @@ def fold_cnv_large(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -186,7 +189,7 @@ def fold_cnv_large(model): def fold_cnv_small(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") # each tuple is (PE, SIMD) for a layer folding = [ (8, 3, "distributed"), @@ -205,7 +208,7 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -529,56 +532,103 @@ def test_streamline(self, topology, wbits, abits, board): model = model.transform(RemoveUnusedTensors()) model.save(get_checkpoint_name(topology, wbits, abits, "streamline")) - def test_convert_to_hls_layers(self, topology, wbits, abits, board): + def test_convert_to_hw_layers(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline") model = load_test_checkpoint_or_skip(prev_chkpt_name) if topology == "tfc" and wbits == 1 and abits == 1: # use standalone thresholds for tfc-w1a1 to also exercise that option - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation(mem_mode)) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode)) # TopK to LabelSelect - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) to standalone thresholding - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions if "fc" not in topology: - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) - model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers")) + model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers")) exp_layer_counts = { "tfc": [ ("Reshape", 1), - ("Thresholding_Batch", 1), + ("Thresholding", 1), ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("LabelSelect", 1), ], "tfc-1-1": [ ("Reshape", 1), - ("Thresholding_Batch", 4), + ("Thresholding", 4), ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("LabelSelect", 1), ], "lfc": [ ("Reshape", 1), - ("Thresholding_Batch", 1), + ("Thresholding", 1), ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("LabelSelect", 1), ], "cnv": [ ("Transpose", 1), - ("Thresholding_Batch", 1), + ("Thresholding", 1), ("ConvolutionInputGenerator", 6), ("MatrixVectorActivation", 9), - ("StreamingMaxPool_Batch", 2), - ("LabelSelect_Batch", 1), + ("StreamingMaxPool", 2), + ("LabelSelect", 1), + ], + } + if topology == "tfc" and wbits == 1 and abits == 1: + exp_key = "tfc-1-1" + else: + exp_key = topology + exp_layer_counts = exp_layer_counts[exp_key] + for op_type, exp_count in exp_layer_counts: + assert len(model.get_nodes_by_op_type(op_type)) == exp_count + + def test_specialize_layers(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") + model = load_test_checkpoint_or_skip(prev_chkpt_name) + # set preferred impl style to hls for all layers + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) + exp_layer_counts = { + "tfc": [ + ("Reshape", 1), + ("Thresholding_hls", 1), + ("MatrixVectorActivation_hls", 4), + ("LabelSelect_hls", 1), + ], + "tfc-1-1": [ + ("Reshape", 1), + ("Thresholding_hls", 4), + ("MatrixVectorActivation_hls", 4), + ("LabelSelect_hls", 1), + ], + "lfc": [ + ("Reshape", 1), + ("Thresholding_hls", 1), + ("MatrixVectorActivation_hls", 4), + ("LabelSelect_hls", 1), + ], + "cnv": [ + ("Transpose", 1), + ("Thresholding_hls", 1), + ("ConvolutionInputGenerator_rtl", 6), + ("MatrixVectorActivation_hls", 9), + ("StreamingMaxPool_hls", 2), + ("LabelSelect_hls", 1), ], } if topology == "tfc" and wbits == 1 and abits == 1: @@ -590,7 +640,7 @@ def test_convert_to_hls_layers(self, topology, wbits, abits, board): assert len(model.get_nodes_by_op_type(op_type)) == exp_count def test_create_dataflow_partition(self, topology, wbits, abits, board): - prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hls_layers") + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "specialize_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) parent_model = model.transform(CreateDataflowPartition()) parent_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") @@ -656,6 +706,9 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") assert len(fifo_layers) > 0 + model = model.transform(SpecializeLayers()) + fifo_layers = model.get_nodes_by_op_type("StreamingFIFO_rtl") + assert len(fifo_layers) > 0 model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) @pytest.mark.slow @@ -665,12 +718,13 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) perf = model.analysis(dataflow_performance) latency = perf["critical_path_cycles"] # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that - for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO"): + for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO_rtl"): getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 512558eb09..ba52548290 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -54,7 +55,7 @@ from qonnx.transformation.remove import RemoveIdentityOps from qonnx.util.cleanup import cleanup as qonnx_cleanup -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb import finn.transformation.streamline.reorder as reorder from finn.core.onnx_exec import execute_onnx @@ -62,13 +63,21 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import alveo_default_platform, alveo_part_map, get_finn_root +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import NormalizePreProc from finn.util.test import ( crop_center, @@ -212,29 +221,42 @@ def test_end2end_mobilenet_lowering(): @pytest.mark.end2end @pytest.mark.xfail -def test_end2end_mobilenet_convert_to_hls_layers(): +def test_end2end_mobilenet_convert_to_hw_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx") - model = model.transform(to_hls.InferPool_Batch()) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferVectorVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) - model = model.transform(to_hls.InferChannelwiseLinearLayer()) - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - model.save(build_dir + "/end2end_mobilenet_hls_layers.onnx") + model.save(build_dir + "/end2end_mobilenet_hw_layers.onnx") + + +@pytest.mark.end2end +def test_end2end_mobilenet_specialize_layers(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx") + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/end2end_mobilenet_specialize_layers.onnx") @pytest.mark.end2end def test_end2end_mobilenet_folding(): - model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hls_layers.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_specialize_layers.onnx") # optional extra folding to use fewer resources # applied while setting the attributes on each node assert extra_fold in [1, 2, 4] # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") # each tuple is (PE, SIMD, ram_style) for a layer folding = [ (32, 3, "block"), @@ -263,7 +285,7 @@ def test_end2end_mobilenet_folding(): getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type) # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer - vvau_layers = model.get_nodes_by_op_type("VectorVectorActivation") + vvau_layers = model.get_nodes_by_op_type("VectorVectorActivation_hls") folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8] for vvau, pe in zip(vvau_layers, folding): vvau_inst = getCustomOp(vvau) @@ -274,11 +296,11 @@ def test_end2end_mobilenet_folding(): convinputgen_inst.set_nodeattr("SIMD", pe // extra_fold) # set SIMD in preceeding FMPadding to same value padding = model.find_direct_predecessors(convinputgen)[0] - if padding.op_type == "FMPadding_Batch": + if padding.op_type == "FMPadding_hls": padding_inst = getCustomOp(padding) padding_inst.set_nodeattr("SIMD", pe // extra_fold) # adjust final pooling layer + its inpgen - pool_node = model.get_nodes_by_op_type("Pool_Batch")[0] + pool_node = model.get_nodes_by_op_type("Pool_hls")[0] pool_inst = getCustomOp(pool_node) pool_inst.set_nodeattr("PE", 4 // extra_fold) pool_inpgen = model.find_direct_predecessors(pool_node)[0] @@ -289,8 +311,16 @@ def test_end2end_mobilenet_folding(): @pytest.mark.end2end -def test_end2end_mobilenet_create_dataflow_partition(): +def test_end2end_mobilenet_minimize_bit_width(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(MinimizeWeightBitWidth()) + model = model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") + + +@pytest.mark.end2end +def test_end2end_mobilenet_create_dataflow_partition(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") parent_model = model.transform(CreateDataflowPartition()) parent_model.save(build_dir + "/end2end_mobilenet_dataflow_parent.onnx") sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] @@ -306,7 +336,7 @@ def test_end2end_mobilenet_create_dataflow_partition(): @pytest.mark.end2end @pytest.mark.xfail def test_end2end_mobilenet_cppsim(): - model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") x = np.load(build_dir + "/end2end_mobilenet_input.npy") inp_name = model.graph.input[0].name out_name = model.graph.output[0].name diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py deleted file mode 100644 index f8e566156b..0000000000 --- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import numpy as np -import os -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import ( - GiveReadableTensorNames, - GiveUniqueNodeNames, - SortGraph, -) -from qonnx.transformation.infer_data_layouts import InferDataLayouts -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.insert_topk import InsertTopK -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.streamline.absorb import ( - AbsorbConsecutiveTransposes, - AbsorbScalarMulAddIntoTopK, -) -from finn.transformation.streamline.collapse_repeated import ( - CollapseRepeatedAdd, - CollapseRepeatedMul, -) -from finn.transformation.streamline.reorder import ( - MoveAddPastMul, - MoveScalarLinearPastInvariants, -) -from finn.util.test import soft_verify_topk - -export_onnx_path = "test_output_synthetic.onnx" - -# construct a synthetic graph to test: -# topk insertion, topk conversion to hls, add conversion to hls -# graph should just be a sum - - -def make_model(ch, ifmdim): - shape = [1, ch, ifmdim, ifmdim] - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) - inp1_add0_ct = helper.make_tensor_value_info("inp1_add0_ct", TensorProto.FLOAT, [1]) - inp1_add = helper.make_tensor_value_info("inp1_add", TensorProto.FLOAT, shape) - inp1_add_ct = helper.make_tensor_value_info("inp1_add_ct", TensorProto.FLOAT, [1]) - inp2_add = helper.make_tensor_value_info("inp2_add", TensorProto.FLOAT, shape) - inp2_add_ct = helper.make_tensor_value_info("inp2_add_ct", TensorProto.FLOAT, [1]) - inp1_mul = helper.make_tensor_value_info("inp1_mul", TensorProto.FLOAT, shape) - inp1_mul_ct = helper.make_tensor_value_info("inp1_mul_ct", TensorProto.FLOAT, [1]) - inp2_mul = helper.make_tensor_value_info("inp2_mul", TensorProto.FLOAT, shape) - inp2_mul_ct = helper.make_tensor_value_info("inp2_mul_ct", TensorProto.FLOAT, [1]) - eltwise_add = helper.make_tensor_value_info("eltwise_add", TensorProto.FLOAT, shape) - pool = helper.make_tensor_value_info("pool", TensorProto.FLOAT, [1, ch, 1, 1]) - reshape_ct = helper.make_tensor_value_info("reshape_ct", TensorProto.INT64, [2]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) - - add0_node = helper.make_node("Add", [inp.name, inp1_add0_ct.name], ["out_add0"]) - add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name]) - add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name]) - mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]) - mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]) - eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name]) - globalavgpool_node = helper.make_node("GlobalAveragePool", [eltwise_add.name], [pool.name]) - reshape_node = helper.make_node("Reshape", [pool.name, reshape_ct.name], [outp.name]) - - graph = helper.make_graph( - nodes=[ - add0_node, - add1_node, - add2_node, - mul1_node, - mul2_node, - eltwise_add_node, - globalavgpool_node, - reshape_node, - ], - name="graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="add-model") - model = ModelWrapper(model) - - # set initializers for scalar add/mul nodes - model.set_initializer(add0_node.input[1], np.array([0.0], dtype=np.float32)) - model.set_initializer(add1_node.input[1], np.array([7.0], dtype=np.float32)) - model.set_initializer(add2_node.input[1], np.array([8.0], dtype=np.float32)) - model.set_initializer(mul1_node.input[1], np.array([2.0], dtype=np.float32)) - model.set_initializer(mul2_node.input[1], np.array([2.0], dtype=np.float32)) - model.set_initializer(reshape_node.input[1], np.array([1, -1], dtype=np.int64)) - - return model - - -# data types -@pytest.mark.parametrize("idt", [DataType["UINT2"]]) -# channels -@pytest.mark.parametrize("ch", [16]) -# ifmdim -@pytest.mark.parametrize("ifmdim", [5]) -@pytest.mark.fpgadataflow -@pytest.mark.vivado -@pytest.mark.slow -def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): - model = make_model(ch, ifmdim) - model.save(export_onnx_path) - model = ModelWrapper(export_onnx_path, fix_float64=True) - model = model.transform(InferShapes()) - model = model.transform(FoldConstants()) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(GiveReadableTensorNames()) - model = model.transform(InferDataLayouts()) - # model.save("golden.onnx") - # generate test vectors of correct shape - if ifmdim == -1: - input_tensor_shape = (1, ch) - else: - input_tensor_shape = (1, ch, ifmdim, ifmdim) - - x = gen_finn_dt_tensor(idt, input_tensor_shape) - - # generate expected value from streamlined net - input_dict = {model.graph.input[0].name: x} - - output_dict = oxe.execute_onnx(model, input_dict, True) - produced_sum = output_dict[model.graph.output[0].name] - chw_mul = model.get_initializer(model.graph.node[-1].input[1]) - chw_mul = 1 - expected_sum = chw_mul * np.sum(2 * (2 * x + 15.0), axis=(2, 3)) / (ifmdim * ifmdim) - assert (produced_sum.flatten() == expected_sum.flatten()).all() - - model = model.transform(InferDataLayouts()) - - # convert to hls - model.set_tensor_datatype(model.graph.input[0].name, idt) - # extra streamlining - model = model.transform(MoveScalarLinearPastInvariants()) - model = model.transform(MoveAddPastMul()) - model = model.transform(CollapseRepeatedMul()) - model = model.transform(CollapseRepeatedAdd()) - # insert top-k node, which should absorb linear ops before it - - model = model.transform(InferShapes()) - model = model.transform(InferDataLayouts()) - model = model.transform(InferDataTypes()) - - model = model.transform(to_hls.InferChannelwiseLinearLayer()) - model = model.transform(to_hls.InferAddStreamsLayer()) - model = model.transform(to_hls.InferGlobalAccPoolLayer()) - model = model.transform(MoveScalarLinearPastInvariants()) - model = model.transform(InsertTopK()) - model = model.transform(AbsorbScalarMulAddIntoTopK()) - model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferLabelSelectLayer()) - model = model.transform(AbsorbConsecutiveTransposes()) - model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferLabelSelectLayer()) - model = model.transform(to_hls.InferDuplicateStreamsLayer()) - - model = model.transform(SortGraph()) - - # model.save("golden_hls.onnx") - # check topology status - - finn_nodes = model.get_finn_nodes() - assert len(finn_nodes) == 9 - add_nodes = model.get_nodes_by_op_type("AddStreams_Batch") - assert len(add_nodes) == 1 - pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch") - assert len(pool_nodes) == 1 - label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch") - assert len(label_nodes) == 1 - channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch") - assert len(channelwise_nodes) == 5 - dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch") - assert len(dup_nodes) == 1 - - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - model = model.transform(SetExecMode("cppsim")) - - output_dict = oxe.execute_onnx(model, input_dict, True) - produced_topk_hls = output_dict[model.graph.output[0].name] - topk_input = output_dict[model.graph.node[-1].input[0]] - assert soft_verify_topk(topk_input, produced_topk_hls, 5) - - os.remove(export_onnx_path) diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 32ec229334..55f46e321b 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -74,7 +74,7 @@ @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): +def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): pad, kernel_size, stride, dilation = conv_config np.random.seed(0) idt = DataType["UINT4"] diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py index 59d65c820d..f7b3c55c2a 100755 --- a/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py @@ -82,7 +82,7 @@ def get_multithreshold_rand_params(channels, num_of_thres, seed=None): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): +def test_convert_to_hw_conv_fc_transition(conv_config, depthwise, use_reshape): np.random.seed(0) idt = DataType["UINT4"] odt = DataType["UINT4"] diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py similarity index 86% rename from tests/fpgadataflow/test_convert_to_hls_conv_layer.py rename to tests/fpgadataflow/test_convert_to_hw_conv_layer.py index 95beffafac..8cade1bfa1 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,7 +42,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -49,6 +50,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node # conv_config kernel_size,stride, pad @@ -62,7 +65,7 @@ @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): +def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): kernel_size, stride, pad = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -71,7 +74,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod in_chn = 16 if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL SWG") + pytest.skip("Skip cppsim if SWG in rtl") if depthwise is True: group = out_chn = in_chn @@ -120,12 +123,19 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) + new_model = new_model.transform(to_hw.InferConvInpGen()) + if not use_rtl_swg: + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) else: - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation")[0] + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) + fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") @@ -156,9 +166,9 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod assert oxe.compare_execution(model, new_model, inp_dict) if not use_rtl_swg and kernel_size == 1 and stride > 1 and pad == 0: - assert new_model.graph.node[1].op_type == "DownSampler" + assert new_model.graph.node[1].op_type == "DownSampler_hls" if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("DownSampler")[0] + node = new_model.get_nodes_by_op_type("DownSampler_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) @@ -170,12 +180,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod if use_rtl_swg: padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] else: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_node = new_model.get_nodes_by_op_type("FMPadding_hls")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = new_model.get_nodes_by_op_type("VectorVectorActivation_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py similarity index 84% rename from tests/fpgadataflow/test_convert_to_hls_layers_cnv.py rename to tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index c9cb4f0802..117a9a5850 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,24 +49,25 @@ from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC from finn.util.test import get_test_model_trained -export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx" +export_onnx_path_cnv = "test_convert_to_hw_layers_cnv.onnx" @pytest.mark.fpgadataflow @pytest.mark.vivado # Standalone or fused thresholding-based activation @pytest.mark.parametrize("fused_activation", [True, False]) -def test_convert_to_hls_layers_cnv_w1a1(fused_activation): +def test_convert_to_hw_layers_cnv_w1a1(fused_activation): cnv = get_test_model_trained("CNV", 1, 1) export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) qonnx_cleanup(export_onnx_path_cnv, out_file=export_onnx_path_cnv) @@ -95,14 +97,21 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): expected_ctx = oxe.execute_onnx(model, input_dict, True) expected = expected_ctx[model.graph.output[0].name] - # if we infer thresholding first, all MultiThresholds get converted to HLS + # if we infer thresholding first, all MultiThresholds get converted to HW # subsequently, the FC inference will generate passthrough MVAUs if not fused_activation: - model = model.transform(to_hls.InferThresholdingLayer()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + conv_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator") + for cnv in conv_nodes: + cnv_inst = getCustomOp(cnv) + cnv_inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(to_hw.InferStreamingMaxPool()) + model = model.transform(SpecializeLayers()) for node in model.graph.node: - if node.op_type == "MatrixVectorActivation": + if node.op_type == "MatrixVectorActivation_hls": inst = getCustomOp(node) inst.set_nodeattr("mem_mode", "decoupled") mw = inst.get_nodeattr("MW") @@ -117,25 +126,23 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): else: simd = mw inst.set_nodeattr("SIMD", simd) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) # check topology status finn_nodes = model.get_finn_nodes() if fused_activation: assert len(finn_nodes) == 18 else: assert len(finn_nodes) == 26 - thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch") + thr_nodes = model.get_nodes_by_op_type("Thresholding_hls") assert len(thr_nodes) == 8 non_finn_nodes = model.get_non_finn_nodes() assert len(non_finn_nodes) == 5 exp_non_finn_nodes = ["Transpose", "Transpose", "Reshape", "Mul", "Add"] assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes - fc_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_nodes = model.get_nodes_by_op_type("MatrixVectorActivation_hls") assert len(fc_nodes) == 9 - swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") assert len(swg_nodes) == 6 - mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_Batch") + mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_hls") assert len(mp_nodes) == 2 model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py similarity index 88% rename from tests/fpgadataflow/test_convert_to_hls_layers_fc.py rename to tests/fpgadataflow/test_convert_to_hw_layers_fc.py index 8a7b2509a4..13f6a4393e 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,22 +49,23 @@ from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.test import get_test_model_trained -export_onnx_path = "test_convert_to_hls_layers_fc.onnx" +export_onnx_path = "test_convert_to_hw_layers_fc.onnx" @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_layers_tfc_w1a1(): +def test_convert_to_hw_layers_tfc_w1a1(): tfc = get_test_model_trained("TFC", 1, 1) export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) @@ -79,24 +81,25 @@ def test_convert_to_hls_layers_tfc_w1a1(): model = model.transform(absorb.AbsorbAddIntoMultiThreshold()) model = model.transform(absorb.AbsorbMulIntoMultiThreshold()) model = model.transform(RoundAndClipThresholds()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation" + assert fc0.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 1] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation" + assert fc1.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 1] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation" + assert fc2.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 1] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation" + assert fc3.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] @@ -137,7 +140,7 @@ def test_convert_to_hls_layers_tfc_w1a1(): @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_layers_tfc_w1a2(): +def test_convert_to_hw_layers_tfc_w1a2(): tfc = get_test_model_trained("TFC", 1, 2) export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) @@ -150,29 +153,26 @@ def test_convert_to_hls_layers_tfc_w1a2(): model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) - from finn.transformation.fpgadataflow.convert_to_hls_layers import ( - InferQuantizedMatrixVectorActivation, - ) - - model = model.transform(InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation" + assert fc0.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 2] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation" + assert fc1.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 2] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation" + assert fc2.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 2] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation" + assert fc3.op_type == "MatrixVectorActivation_hls" assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] fc0w = getCustomOp(fc0) diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py index 02a53485ad..6c83f10617 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -68,7 +68,7 @@ export_onnx_path = "test_output_synthetic.onnx" # construct a synthetic graph to test: -# topk insertion, topk conversion to hls, add conversion to hls +# topk insertion, topk conversion to hw, add conversion to hw # graph should just be a sum @@ -137,7 +137,7 @@ def make_model(ch, ifmdim): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): +def test_convert_to_hw_layers_synthetic(ch, ifmdim, idt): model = make_model(ch, ifmdim) model.save(export_onnx_path) model = ModelWrapper(export_onnx_path, fix_float64=True) @@ -166,7 +166,7 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(InferDataLayouts()) - # convert to hls + # convert to hw model.set_tensor_datatype(model.graph.input[0].name, idt) # extra streamlining model = model.transform(MoveScalarLinearPastInvariants()) diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index dffc5c4642..685c955f4e 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -129,7 +129,7 @@ def test_convert_multithreshold_to_hardware( pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 - # See convert_to_hls_layers::InferThresholdingLayer: + # See convert_to_hw_layers::InferThresholdingLayer: # assert (not odt.signed()) or (actval < 0) # This implies that it expects a negative activation, BIPOLAR does not provide that if activation == DataType["BIPOLAR"]: diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 2ffd696528..6ad8618981 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import ( +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( InferConvInpGen, InferVectorVectorActivation, ) @@ -54,6 +55,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): @@ -166,7 +169,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): +def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): idt = wdt = DataType["INT4"] ifm_dim = 6 ifm_ch = 4 @@ -179,14 +182,21 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) + # for cppsim set all layers to preferred impl style = "hls" + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + + new_model = new_model.transform(SpecializeLayers()) # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator": + if n.op_type == "ConvolutionInputGenerator_hls": convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation": + elif n.op_type == "VectorVectorActivation_hls": vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) new_model = new_model.transform(SetExecMode("cppsim")) @@ -209,7 +219,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): +def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): idt = wdt = DataType["INT4"] ifm_dim = 6 ifm_ch = 4 @@ -223,13 +233,14 @@ def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator": + if n.op_type == "ConvolutionInputGenerator_rtl": convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation": + elif n.op_type == "VectorVectorActivation_hls": vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index ee37ab86ef..a05dd53e28 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, Advanced Micro Devices, Inc. +# Copyright (c) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -48,7 +48,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx from finn.core.rtlsim_exec import rtlsim_exec @@ -249,10 +249,11 @@ def test_fpgadataflow_conv_dynamic(cfg): # convert to hardware and prepare simulation model = largest_model.transform(LowerConvsToMatMul()) - model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) - model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferVectorVectorActivation()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(SpecializeLayers()) parent_model = model.transform(CreateDataflowPartition()) sdp_inst = getCustomOp(parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]) model = ModelWrapper(sdp_inst.get_nodeattr("model")) @@ -268,8 +269,8 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1) getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16]) getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) - comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") - comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation") + comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation_hls") for comp_node in comp_nodes: if depthwise: getCustomOp(comp_node).set_nodeattr("PE", 4) @@ -278,6 +279,7 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(comp_node).set_nodeattr("PE", 4) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) From 0fe2e30cfb7c1e804756651c7b07705bca971b6d Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 9 Feb 2024 14:20:04 +0000 Subject: [PATCH 120/291] [Tests] Update infer data layout test --- tests/transformation/test_infer_data_layouts_cnv.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py index 2d7fc54f94..6b6674d661 100644 --- a/tests/transformation/test_infer_data_layouts_cnv.py +++ b/tests/transformation/test_infer_data_layouts_cnv.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.cleanup import cleanup as qonnx_cleanup -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline @@ -101,10 +102,10 @@ def test_infer_data_layouts_cnv(): model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataLayouts()) From 40cfe01c70173e43a6222f229d925d5944fc6958 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 9 Feb 2024 16:04:36 +0000 Subject: [PATCH 121/291] [Builder/Transform] Update builder and transformations according to new flow --- .../analysis/fpgadataflow/res_estimation.py | 2 +- src/finn/builder/build_dataflow_config.py | 35 +- src/finn/builder/build_dataflow_steps.py | 94 +- .../custom_op/fpgadataflow/streamingfifo.py | 2 +- src/finn/qnn-data/build_dataflow/build.py | 1 + .../specialize_layers_config.json | 30 + .../fpgadataflow/convert_to_hls_layers.py | 1782 ----------------- .../fpgadataflow/create_stitched_ip.py | 6 +- .../fpgadataflow/derive_characteristic.py | 6 +- .../transformation/fpgadataflow/floorplan.py | 2 +- .../transformation/fpgadataflow/insert_dwc.py | 2 +- .../fpgadataflow/insert_fifo.py | 12 +- .../fpgadataflow/insert_tlastmarker.py | 2 +- .../fpgadataflow/set_fifo_depths.py | 25 +- .../fpgadataflow/set_folding.py | 33 +- .../fpgadataflow/specialize_layers.py | 5 - src/finn/transformation/move_reshape.py | 2 +- src/finn/util/pyverilator.py | 2 +- tests/end2end/test_end2end_bnn_pynq.py | 3 - tests/util/test_build_dataflow.py | 1 + 20 files changed, 164 insertions(+), 1883 deletions(-) create mode 100644 src/finn/qnn-data/build_dataflow/specialize_layers_config.json delete mode 100644 src/finn/transformation/fpgadataflow/convert_to_hls_layers.py diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index d48c423b9d..000e1208d7 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -72,7 +72,7 @@ def res_estimation_complete(model): inst.set_nodeattr("resType", "lut") res_dict[node.name].append(inst.node_res_estimation()) inst.set_nodeattr("resType", orig_restype) - elif op_type == "ConvolutionInputGenerator": + elif op_type.startswith("ConvolutionInputGenerator"): orig_ramstyle = inst.get_nodeattr("ram_style") res_dict[node.name] = [] inst.set_nodeattr("ram_style", "block") diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index e4fed05731..1b22265a4d 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -115,14 +116,15 @@ class VerificationStepType(str, Enum): "step_qonnx_to_finn", "step_tidy_up", "step_streamline", - "step_convert_to_hls", + "step_convert_to_hw", "step_create_dataflow_partition", + "step_specialize_layers", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", - "step_hls_codegen", - "step_hls_ipgen", + "step_hw_codegen", + "step_hw_ipgen", "step_set_fifo_depths", "step_create_stitched_ip", "step_measure_rtlsim_performance", @@ -137,17 +139,18 @@ class VerificationStepType(str, Enum): "step_qonnx_to_finn", "step_tidy_up", "step_streamline", - "step_convert_to_hls", + "step_convert_to_hw", "step_create_dataflow_partition", + "step_specialize_layers", "step_target_fps_parallelization", "step_apply_folding_config", "step_minimize_bit_width", "step_generate_estimate_reports", ] -#: List of steps to run for a dataflow build including HLS code generation, but +#: List of steps to run for a dataflow build including HW code generation, but #: without any synthesis. -hls_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hls_codegen"] +hw_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hw_codegen"] @dataclass_json @@ -170,6 +173,14 @@ class DataflowBuildConfig: #: DataflowOutputType for available options. generate_outputs: List[DataflowOutputType] + #: (Optional) Path to configuration JSON file in which user can specify + #: a preferred implementation style (HLS or RTL) for each node. + #: The SpecializeLayers transformation picks up these settings and if possible + #: fulfills the desired implementation style for each layer by converting the + #: node into its HLS or RTL variant. + #: Will be applied with :py:mod:`qonnx.transformation.general.ApplyConfig` + specialize_layers_config_file: Optional[str] = None + #: (Optional) Path to configuration JSON file. May include parallelization, #: FIFO sizes, RAM and implementation style attributes and so on. #: If the parallelization attributes (PE, SIMD) are part of the config, @@ -230,7 +241,7 @@ class DataflowBuildConfig: mvau_wwidth_max: Optional[int] = 36 #: (Optional) Whether thresholding layers (which implement quantized - #: activations in FINN) will be implemented as stand-alone HLS layers, + #: activations in FINN) will be implemented as stand-alone HW layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. standalone_thresholds: Optional[bool] = False @@ -277,7 +288,7 @@ class DataflowBuildConfig: #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO - #: Target clock frequency (in nanoseconds) for Vivado HLS synthesis. + #: Target clock frequency (in nanoseconds) for Vitis HLS synthesis. #: e.g. `hls_clk_period_ns=5.0` will target a 200 MHz clock. #: If not specified it will default to synth_clk_period_ns hls_clk_period_ns: Optional[float] = None @@ -285,10 +296,6 @@ class DataflowBuildConfig: #: Which memory mode will be used for compute layers default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED - #: Force inference of RTL ConvolutionInputGenerator over HLS implementation - #: If set to False, falls back to the default behavior of InferConvInpGen() - force_rtl_conv_inp_gen: Optional[bool] = False - #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` #: e.g. "xilinx_u250_xdma_201830_2" @@ -347,8 +354,8 @@ class DataflowBuildConfig: #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 - #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during - #: rtlsim, otherwise they will be replaced by HLS implementations. + #: If set to True, FIFOs with impl_style=vivado will be kept during + #: rtlsim, otherwise they will be replaced by RTL implementations. rtlsim_use_vivado_comps: Optional[bool] = True def _resolve_hls_clk_period(self): diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 54ba7e4ea1..d031e971f1 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -52,7 +53,7 @@ from qonnx.util.config import extract_model_config_to_json from shutil import copy -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -108,6 +109,7 @@ SplitLargeFIFOs, ) from finn.transformation.fpgadataflow.set_folding import SetFolding +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.fpgadataflow.vitis_build import VitisBuild from finn.transformation.move_reshape import RemoveCNVtoFCFlatten @@ -216,23 +218,15 @@ def verify_step( def prepare_for_stitched_ip_rtlsim(verify_model, cfg): if not cfg.rtlsim_use_vivado_comps: need_restitch = False - # switch impl_style=vivado components to rtl/hls + # switch impl_style=vivado components to rtl # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO_rtl"): inst = getCustomOp(fifo_layer) if inst.get_nodeattr("impl_style") != "rtl": inst.set_nodeattr("impl_style", "rtl") inst.set_nodeattr("code_gen_dir_ipgen", "") inst.set_nodeattr("ipgen_path", "") need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type("StreamingDataWidthConverter_Batch"): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True # if we've made alterations to the model, need to do some re-prep if need_restitch: print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") @@ -336,43 +330,43 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): return model -def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert eligible nodes to `HLSCustomOp` subclasses that represent HLS - layers. Which nodes and particular configurations can be converted to HLS - is limited, see the source code of the `convert_to_hls` module for more.""" +def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert eligible nodes to `HWCustomOp` subclasses that represent HW + layers. Which nodes and particular configurations can be converted to HW + is limited, see the source code of the `convert_to_hw` module for more. + In the end am empty json file is created which can be used to set user specific + preferred implementation styles for each node.""" mem_mode = cfg.default_mem_mode.value if cfg.standalone_thresholds: # doing this first causes all threshold layers to be standalone - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation(mem_mode)) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode)) # TopK to LabelSelect - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) as standalone threshold - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions -- TODO always exec? need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - if cfg.force_rtl_conv_inp_gen: - model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) - else: - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) + return model def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig): - """Separate consecutive groups of HLSCustomOp nodes into StreamingDataflowPartition + """Separate consecutive groups of HWCustomOp nodes into StreamingDataflowPartition nodes, which point to a separate ONNX file. Dataflow accelerator synthesis - can only be performed on those HLSCustomOp sub-graphs.""" + can only be performed on those HWCustomOp sub-graphs.""" parent_model = model.transform( CreateDataflowPartition( @@ -387,6 +381,31 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig if cfg.save_intermediate_models: parent_model.save(cfg.output_dir + "/intermediate_models/dataflow_parent.onnx") model = ModelWrapper(dataflow_model_filename) + + # create a configuration json file that can be used to set the specialize layer config + attrs = [ + "preferred_impl_style", + ] + extract_model_config_to_json( + model, cfg.output_dir + "/template_specialize_layers_config.json", attrs + ) + + return model + + +def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert HW nodes to either an HLS or RTL variant of the node. HW nodes + get converted either based on pre-determined rules (details can be found + in `specialize_layers` source code) or the user provides a configuration file + which contains the desired setting. If the user preference cannot be fulfilled, + a warning will be printed and the implementation style will be set to a default.""" + + if cfg.specialize_layers_config_file is not None: + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(ApplyConfig(cfg.specialize_layers_config_file)) + model = model.transform(SpecializeLayers()) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return model @@ -482,16 +501,17 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): return model -def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): - "Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation." +def step_hw_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): + """Generate Vitis HLS code to prepare HLSBackend nodes for IP generation. + And fills RTL templates for RTLBackend nodes.""" model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) return model -def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): - """Run Vivado HLS synthesis on generated code for HLSCustomOp nodes, - in order to generate IP blocks.""" +def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): + """Run Vitis HLS synthesis on generated code for HLSBackend nodes, + in order to generate IP blocks. For RTL nodes this step does not do anything.""" model = model.transform(HLSSynthIP()) model = model.transform(ReplaceVerilogRelPaths()) @@ -519,6 +539,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform( PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) @@ -536,6 +557,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): create_shallow_fifos=True, ) ) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == "largefifo_rtlsim": @@ -566,6 +588,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # need to make sure all FIFOs are created so that their depth can be # set by ApplyConfig, so create_shallow_fifos=True model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: @@ -823,14 +846,15 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_qonnx_to_finn": step_qonnx_to_finn, "step_tidy_up": step_tidy_up, "step_streamline": step_streamline, - "step_convert_to_hls": step_convert_to_hls, + "step_convert_to_hw": step_convert_to_hw, + "step_specialize_layers": step_specialize_layers, "step_create_dataflow_partition": step_create_dataflow_partition, "step_target_fps_parallelization": step_target_fps_parallelization, "step_apply_folding_config": step_apply_folding_config, "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, - "step_hls_codegen": step_hls_codegen, - "step_hls_ipgen": step_hls_ipgen, + "step_hw_codegen": step_hw_codegen, + "step_hw_ipgen": step_hw_ipgen, "step_set_fifo_depths": step_set_fifo_depths, "step_create_stitched_ip": step_create_stitched_ip, "step_measure_rtlsim_performance": step_measure_rtlsim_performance, diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index b55af929ed..1556575b00 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -104,7 +104,7 @@ def get_verilog_top_module_intf_names(self): def get_normal_input_shape(self, ind=0): depth = self.get_adjusted_depth() - assert depth >= 2, """Depth is too low""" + assert depth >= 1, """Depth is too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": warnings.warn("Depth is high, set between 2 and 256 for efficient SRL implementation") return self.get_nodeattr("normal_shape") diff --git a/src/finn/qnn-data/build_dataflow/build.py b/src/finn/qnn-data/build_dataflow/build.py index 0d9d55a086..13d58d2c91 100644 --- a/src/finn/qnn-data/build_dataflow/build.py +++ b/src/finn/qnn-data/build_dataflow/build.py @@ -43,6 +43,7 @@ mvau_wwidth_max=10000, # can specify detailed folding/FIFO/etc config with: # folding_config_file="folding_config.json", + specialize_layers_config_file="specialize_layers_config.json", synth_clk_period_ns=10.0, board=platform_name, shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, diff --git a/src/finn/qnn-data/build_dataflow/specialize_layers_config.json b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json new file mode 100644 index 0000000000..4fc37896db --- /dev/null +++ b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json @@ -0,0 +1,30 @@ +{ + "Defaults": {}, + "Thresholding_0": { + "preferred_impl_style": "hls" + }, + "MatrixVectorActivation_0": { + "preferred_impl_style": "hls" + }, + "Thresholding_1": { + "preferred_impl_style": "" + }, + "MatrixVectorActivation_1": { + "preferred_impl_style": "" + }, + "Thresholding_2": { + "preferred_impl_style": "" + }, + "MatrixVectorActivation_2": { + "preferred_impl_style": "" + }, + "Thresholding_3": { + "preferred_impl_style": "rtl" + }, + "MatrixVectorActivation_3": { + "preferred_impl_style": "" + }, + "LabelSelect_0": { + "preferred_impl_style": "hls" + } +} diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py deleted file mode 100644 index ef02453498..0000000000 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ /dev/null @@ -1,1782 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -import numpy as np -import qonnx.core.data_layout as DataLayout -import warnings -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.base import Transformation -from qonnx.transformation.general import SortGraph -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes -from qonnx.util.basic import get_by_name -from qonnx.util.onnx import nchw_to_nhwc - - -class InferConvInpGen(Transformation): - """Convert Im2Col layers to ConvolutionInputGenerator layers.""" - - def __init__(self, use_rtl_variant=False): - super().__init__() - self.use_rtl_variant = use_rtl_variant - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "Im2Col": - i2c_input = n.input[0] - i2c_output = n.output[0] - i2c_in_shape = model.get_tensor_shape(i2c_input) - i2c_out_shape = model.get_tensor_shape(i2c_output) - dt = model.get_tensor_datatype(i2c_input) - if not dt.is_integer(): - warnings.warn("%s : Input is not int. Can't infer ConvInpGen." % n.name) - continue - i2c_inst = getCustomOp(n) - stride_h, stride_w = i2c_inst.get_nodeattr("stride") - k_h, k_w = i2c_inst.get_nodeattr("kernel_size") - pad_attr = i2c_inst.get_nodeattr("pad_amount") - pad_h = pad_attr[0] + pad_attr[2] - pad_w = pad_attr[1] + pad_attr[3] - dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations") - # temporary checks until non-square conv support is finalized - pad_val = i2c_inst.get_nodeattr("pad_value") - depthwise = i2c_inst.get_nodeattr("depthwise") - ifm_ch = i2c_in_shape[-1] - ifm_dim_h = i2c_in_shape[1] - ifm_dim_w = i2c_in_shape[2] - ofm_dim_h = i2c_out_shape[1] - ofm_dim_w = i2c_out_shape[2] - - # default params for ConvolutionInputGenerator - ConvInpGen_node_idx = node_ind - ConvInpGen_input = i2c_input - ConvInpGen_idim_h = ifm_dim_h - ConvInpGen_idim_w = ifm_dim_w - - if pad_h > 0 or pad_w > 0: - # if padding enabled, ensure pad_val supported by DataType - # assert dt.allowed(pad_val),"""FMPadding_Batch DataType - # must support pad_val""" - assert pad_val == 0, ( - "%s : FMPadding_Batch doesn't currently support pad_val!= 0" % n.name - ) - - odim_padding_h = ifm_dim_h + pad_h - odim_padding_w = ifm_dim_w + pad_w - - padding_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, odim_padding_h, odim_padding_w, ifm_ch), - ) - graph.value_info.append(padding_out) - padding_out = padding_out.name - model.set_tensor_datatype(padding_out, dt) - - ConvInpGen_node_idx += 1 - ConvInpGen_input = padding_out - ConvInpGen_idim_h = odim_padding_h - ConvInpGen_idim_w = odim_padding_w - - padding_optype = "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch" - - padding_node = helper.make_node( - padding_optype, - [i2c_input], - [padding_out], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ImgDim=[ifm_dim_h, ifm_dim_w], - Padding=pad_attr, - NumChannels=ifm_ch, - inputDataType=dt.name, - SIMD=ifm_ch, - name="FMPadding_Batch_" + n.name, - ) - graph.node.insert(node_ind, padding_node) - - is_kernel_pointwise = k_h == 1 and k_w == 1 - is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w - is_square_kernel = k_h == k_w - is_equal_stride = stride_h == stride_w - is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( - k_h > 1 and k_w == 1 and ifm_dim_w == 1 - ) - - if self.use_rtl_variant: - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator_rtl", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - M=1, - parallel_window=0, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_rtl_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - else: - # Ensure that only supported HLS nodes are inserted - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) - is1D_unitx = ifm_dim_w == 1 - downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride - if not (downsample_1D or downsample_2D): - warnings.warn(f"Couldn't infer Downsample from {n.name},check config.") - continue - ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) - stride = max(stride_h, stride_w) - # create DownSampler node - ConvInpGen_node = helper.make_node( - "DownSampler", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, - SIMD=ifm_ch, - Stride=stride, - inputDataType=dt.name, - name="DownSampler_" + n.name, - is1D=downsample_1D, - is1D_unitx=is1D_unitx, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - else: - # create equivalent ConvolutionInputGenerator node - if is_square_image and is_square_kernel: # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name - ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, - ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - """%s: ConvolutionInputGenerator1D works only - for 1D convs""" - % n.name - ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" - % n.name - ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - # remove old nodes - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferUpsample(Transformation): - """ - Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour_Batch nodes. - """ - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "Upsample" or n.op_type == "Resize": - # Extract mode and scales and input shape - mode = get_by_name(n.attribute, "mode").s.decode("ascii") - if n.op_type == "Upsample": - scales = model.get_initializer(n.input[1]) - else: - scales = model.get_initializer(n.input[2]) - in_shape = model.get_tensor_shape(n.input[0]) - - dt = model.get_tensor_datatype(n.input[0]) - if not dt.is_integer(): - warnings.warn( - "%s: Input not int. Can't infer UpsampleNearestNeighbour." % n.name - ) - continue - - if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC: - warnings.warn( - "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." % n.name - ) - continue - - # Check that the parameters are okay - assert mode == "nearest", ( - "%s: Upsampling is only supported for the mode nearest." % n.name - ) - assert len(in_shape) == 4, "Upsampling is only supported for 4D inputs." - assert scales.shape == (4,), ( - "%s: Upsampling is only supported for 4D scales." % n.name - ) - assert (scales >= 1).all(), ( - n.name + ": Upsampling is only supported for scales " - "which are larger or equal 1 in all dimensions." - ) - - # Assumes nhwc layout for scales and input - is_scale_square_2d = scales[1] == scales[2] - is_scale_1d = scales[1] > 1 and scales[2] == 1 - assert is_scale_square_2d or is_scale_1d, ( - "%s: Upsampling only supported for 1D H, or 2D square scaling" % n.name - ) - assert scales[0] == scales[3] == 1, ( - n.name + ": Upsampling is only supported for scales with " - "the first and last dimensions being 1 in NHWC." - ) - spatial_scale = scales[1] - assert spatial_scale == int(spatial_scale), ( - "%s: Upsampling is only supported for integer scales." % n.name - ) - is_shape_square_2d = in_shape[1] == in_shape[2] - is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1 - - assert is_shape_square_2d or is_shape_1d, ( - "%s: Upsampling is only supported for 1D H or 2D square inputs." % n.name - ) - - # Extract information for HLS node - IFMDim = in_shape[1] - OFMDim = int(round(in_shape[1] * spatial_scale)) - NumChannels = in_shape[-1] - numInputVectors = in_shape[0] - inputDataType = dt.name - dim_mode = 0 if is_shape_square_2d else 1 - - # Insert the HLSCustomOp node - Upsample_HLS_node = helper.make_node( - "UpsampleNearestNeighbour_Batch", - [n.input[0]], - [n.output[0]], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - OFMDim=OFMDim, - IFMDim=IFMDim, - NumChannels=NumChannels, - inputDataType=inputDataType, - numInputVectors=numInputVectors, - DimMode=dim_mode, - name="UpsampleNearestNeighbour_Batch_" + n.name, - ) - - # Remove the old node - graph.node.insert(node_ind, Upsample_HLS_node) - # remove old nodes - graph.node.remove(n) - graph_modified = True - return (model, graph_modified) - - -class InferStreamingMaxPool(Transformation): - """Convert MaxPoolNHWC layers to StreamingMaxPool layers.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "MaxPoolNHWC": - mp_input = node.input[0] - mp_output = node.output[0] - mp_in_shape = model.get_tensor_shape(mp_input) - # mp_out_shape = model.get_tensor_shape(mp_output) - dt = model.get_tensor_datatype(mp_input) - mp_inst = getCustomOp(node) - k_h, k_w = mp_inst.get_nodeattr("kernel_shape") - ifm_ch = mp_in_shape[-1] - ifm_dim_h = mp_in_shape[1] - ifm_dim_w = mp_in_shape[2] - pe = 1 - ceil_mode = mp_inst.get_nodeattr("ceil_mode") - is_1d = (ifm_dim_h == 1 and k_h == 1) or (ifm_dim_w == 1 and k_w == 1) - is_divisable = (ifm_dim_h % k_h == 0) or (ifm_dim_w % k_w == 0) - is_bipolar = dt == DataType["BIPOLAR"] - pass_1d = is_1d and (not is_bipolar) - pass_2d = (not is_1d) and is_divisable - if pass_1d or pass_2d: - # create equivalent StreamingMaxPool_Batch node - new_node = helper.make_node( - "StreamingMaxPool_Batch", - [mp_input], - [mp_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - PoolDim=(k_h, k_w), - NumChannels=ifm_ch, - ImgDim=(ifm_dim_h, ifm_dim_w), - dataType=dt.name, - PE=pe, - CeilMode=ceil_mode, - name="StreamingMaxPool_Batch_" + node.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(node) - graph_modified = True - else: - warnings.warn(node.name + ": could not convert to HLS") - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferPool_Batch(Transformation): - """If kernel_shape > strides, replace Pool layer with with of Im2col - + pool(with kernel_shape == strides), plus Transpose layers to keep the original - data layout.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: - node_input = node.input[0] - ishape = model.get_tensor_shape(node_input) - node_output = node.output[0] - idt = model.get_tensor_datatype(node_input) - oshape = model.get_tensor_shape(node_output) - # only support 4D input tensors (1D convs need extra dummy dim) - if len(ishape) != 4: - continue - - # extract pool parameters - if node.op_type == "MaxPool": - kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) - sh, sw = list(get_by_name(node.attribute, "strides").ints) - dlayout = "NCHW" - elif node.op_type == "QuantAvgPool2d": - inst = getCustomOp(node) - # QuantAvgPool2d has a single scalar attribute - # for kernel size and stride (implicit square) - kh = kw = inst.get_nodeattr("kernel") - sh = sw = inst.get_nodeattr("stride") - dlayout = inst.get_nodeattr("data_layout") - elif node.op_type == "MaxPoolNHWC": - inst = getCustomOp(node) - kh, kw = inst.get_nodeattr("kernel_shape") - sh, sw = inst.get_nodeattr("strides") - dlayout = "NHWC" - try: - pad = list(get_by_name(node.attribute, "pads").ints) - except AttributeError: - pad = [0, 0, 0, 0] - - if not idt.is_integer(): - continue - - if (kh < sh) or (kw < sw): - # TODO check/implement swg support - continue - - odt = model.get_tensor_datatype(node_output) - - if dlayout == "NCHW": - _, ifm_ch, ifm_h, ifm_w = ishape - _, ofm_ch, ofm_h, ofm_w = oshape - elif dlayout == "NHWC": - _, ifm_h, ifm_w, ifm_ch = ishape - _, ofm_h, ofm_w, ofm_ch = oshape - else: - raise Exception("Unknown dlayout: " + str(dlayout)) - - # if data layout NCHW, we need transpose nodes surrounding - # the hls layer - if dlayout == "NCHW": - # create new intermediate values - inp_trans_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ifm_h, ifm_w, ifm_ch), # NHWC - ) - graph.value_info.append(inp_trans_out) - inp_trans_out = inp_trans_out.name - model.set_tensor_datatype(inp_trans_out, idt) - - pool_output = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_h, ofm_w, ofm_ch), - ) - graph.value_info.append(pool_output) - pool_output = pool_output.name - # model.set_tensor_datatype(pool_output, odt) - - im2col_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_h, ofm_w, ifm_ch * kh * kw), - ) - graph.value_info.append(im2col_out) - im2col_out = im2col_out.name - model.set_tensor_datatype(im2col_out, idt) - - # create new nodes - if dlayout == "NCHW": - # NCHW -> NHWC - inp_trans_node = helper.make_node( - "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] - ) - im2col_in = inp_trans_out - else: - im2col_in = node_input - pool_output = node_output - - accum_bits = 0 - pool_size_param = 0 # will be overridden if neededs - pad_value = 0 - if node.op_type in ["MaxPool", "MaxPoolNHWC"]: - pool_fxn = "MaxPool" - odt = idt - pad_value = idt.min() - elif node.op_type == "QuantAvgPool2d": - assert odt.is_integer(), """Output data type for QuantAvgPool2d - needs to be integer""" - assert all(x == 0 for x in pad), "Padding is not supported for QuantAvgPool2d" - inst = getCustomOp(node) - pool_fxn = "QuantAvgPool" - pool_size_param = inst.get_shifts() - accum_bits = inst.get_accum_size() - - else: - raise Exception( - "pad_value and pool_fxn not configured for {}".format(node.op_type) - ) - - # format input tensor - im2col_node = helper.make_node( - "Im2Col", - [im2col_in], - [im2col_out], - domain="qonnx.custom_op.general", - stride=[sh, sw], - kernel_size=[kh, kw], - pad_amount=pad, - pad_value=pad_value, - depthwise=1, - input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), - name="Im2Col_" + node.name, - ) - - # Warning PE has to be equal to ifm_ch until Im2Col is replaced by - # ConvolutionInputGenerator with depthwise=1. - # For other settings the output will be incorrect due to incorrect input - # data layout - pool_node = helper.make_node( - "Pool_Batch", - [im2col_out], - [pool_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - InputDataType=idt.name, - OutputDataType=odt.name, - Channels=ifm_ch, - PE=ifm_ch, - KernelSize=[kh, kw], - Function=pool_fxn, - OutImgDims=[ofm_h, ofm_w], - AccumBits=accum_bits, - Size=pool_size_param, - BatchSize=1, - name="Pool_Batch_" + node.name, - ) - - if dlayout == "NCHW": - # NHWC -> NCHW - out_trans_node = helper.make_node( - "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] - ) - - # insert nodes where the conv is to preserve topological ordering - if dlayout == "NCHW": - graph.node.insert(node_ind, inp_trans_node) - graph.node.insert(node_ind + 1, im2col_node) - graph.node.insert(node_ind + 2, pool_node) - graph.node.insert(node_ind + 3, out_trans_node) - else: - graph.node.insert(node_ind, im2col_node) - graph.node.insert(node_ind + 1, pool_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferBinaryMatrixVectorActivation(Transformation): - """Convert XnorPopcountMatMul layers to - MatrixVectorActivation layers. Any immediately following MultiThreshold - layers will also be absorbed into the MVTU.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "XnorPopcountMatMul": - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( - n.name - + """: First - input for xnorpopcount is not set to FINN DataType BINARY.""" - ) - assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( - n.name - + """: Second - input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" - ) - idt = DataType["BINARY"] - wdt = DataType["BINARY"] - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # extract weight shape, note that ONNX and finn-hlslib - # make different assumptions about dim order here - # ONNX assumes W has (in, out) shape - # finn-hlslib assumes W has (out, in) shape - mh = int(W.shape[1]) - mw = int(W.shape[0]) - # create node with no parallelization first - pe = 1 - simd = 1 - wmem = mw * mh // (pe * simd) - assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisiable by - (WMEM * PE * SIMD) is violated.""" - ) - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # TODO ensure integer thresholds? - # create MVTU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor MH.""" - ) - odt = model.get_tensor_datatype(mt_output) - if odt.bitwidth() == 1: - # covers both bipolar and binary - actval = 0 - else: - actval = odt.min() - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - binaryXnorMode=1, - noActivation=0, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name=n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - binaryXnorMode=1, - noActivation=1, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name=n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferQuantizedMatrixVectorActivation(Transformation): - """Convert MatMul layers with quantized inputs and weights to - MatrixVectorActivation layers. Any immediately following MultiThreshold - layers will also be absorbed into the MVTU.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - idt = model.get_tensor_datatype(mm_input) - wdt = model.get_tensor_datatype(mm_weight) - if idt.is_integer() and wdt.is_integer(): - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # extract weight shape, note that ONNX and finn-hlslib - # make different assumptions about dim order here - # ONNX assumes W has (in, out) shape - # finn-hlslib assumes W has (out, in) shape - mh = int(W.shape[1]) - mw = int(W.shape[0]) - # create node with no parallelization first - pe = 1 - simd = 1 - wmem = mw * mh // (pe * simd) - assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisible by - (WMEM * PE * SIMD) is violated.""" - ) - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # TODO ensure integer thresholds? - # create MVTU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor MH.""" - ) - odt = model.get_tensor_datatype(mt_output) - scale = getCustomOp(consumer).get_nodeattr("out_scale") - actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert int(actval) == actval, ( - consumer.name + ": out_bias must be integer for HLS conversion." - ) - actval = int(actval) - odt_is_bipolar = odt == DataType["BIPOLAR"] - bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) - assert scale == 1.0 or bipolar_ok, ( - consumer.name + ": out_scale=1 or bipolar output needed for conversion." - ) - assert (not odt.signed()) or (actval < 0), ( - consumer.name + ": Signed output requres actval < 0" - ) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - if bipolar_ok: - # remove bias for bipolar, since - # binary->bipolar is achieved by reinterpretation - actval = 0 - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - binaryXnorMode=0, - noActivation=0, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - binaryXnorMode=0, - noActivation=1, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferVectorVectorActivation(Transformation): - """Convert MatMul layers with quantized inputs and weights to - VectorVectorActivation layers, if the sparsity annotation - of the weight matrix indicates that the MatMul layer belongs to - a depthwise convolution. Any immediately following MultiThreshold - layers will also be absorbed into the VVAU.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: - sparsity = model.get_tensor_sparsity(n.input[1]) - try: - k_h, k_w = sparsity["dw"]["kernel_shape"] - except KeyError: - raise Exception( - n.name - + """: sparsity annotation doesn't indicate that MatMul - belongs to a depthwise convolution.""" - ) - - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - idt = model.get_tensor_datatype(mm_input) - wdt = model.get_tensor_datatype(mm_weight) - if idt.is_integer() and wdt.is_integer(): - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # infer dense weight tensor from sparse weight matrix - # kernel size (k_h, k_w) which was extracted above and the value of - # the channels is used. - # the weight matrix has a shape of (k_h * k_w * Channels, Channels) - # we need to reverse the creation of the sparse weight matrix - # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) - channels = int(W.shape[1]) - # transpose to achieve a shape of (k_h * k_w * Channels, Channels) - W = W.T - # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards - # to (Channels, Channels, k_h, k_w) - W = W.reshape(channels, k_h, k_w, channels) - W = W.transpose(0, 3, 1, 2) - # now we can extract the values using a for loop over the channels - # and fill a zero numpy array in the correct shape - w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) - for ch in range(channels): - w_tensor[ch][0] = W[ch][ch] - model.set_initializer(mm_weight, w_tensor) - model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) - # create node with pe=channels as default - pe = channels - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # create VVAU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == channels, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor Channels.""" - ) - odt = model.get_tensor_datatype(mt_output) - scale = getCustomOp(consumer).get_nodeattr("out_scale") - assert scale == 1.0, ( - consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." - ) - actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert int(actval) == actval, ( - consumer.name + ": out_bias must be integer for HLS conversion." - ) - actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - consumer.name + ": Signed output requres actval < 0" - ) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new VectorVectorActivation node - new_node = helper.make_node( - "VectorVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - resType="lut", - PE=pe, - Dim=[mm_in_shape[1], mm_in_shape[2]], - Channels=channels, - Kernel=[k_h, k_w], - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - noActivation=0, - name="VectorVectorActivation_" + n.name, - mem_mode=self.mem_mode, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new VVAU node - new_node = helper.make_node( - "VectorVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - resType="lut", - PE=pe, - Dim=[mm_in_shape[1], mm_in_shape[2]], - Channels=channels, - Kernel=[k_h, k_w], - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - noActivation=1, - name="VectorVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferThresholdingLayer(Transformation): - """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "MultiThreshold": - thl_input = node.input[0] - thl_threshold = node.input[1] - thl_output = node.output[0] - thl_in_shape = model.get_tensor_shape(thl_input) - thl_thres_shape = model.get_tensor_shape(thl_threshold) - idt = model.get_tensor_datatype(thl_input) - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # check layout of inputs/outputs, and convert if needed - # check layout and convert if necessary - thl_in_layout = model.get_tensor_layout(thl_input) - if thl_in_layout == DataLayout.NCHW: - thl_input = nchw_to_nhwc(thl_input, model, node_ind) - node_ind += 1 - thl_in_shape = model.get_tensor_shape(thl_input) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - thl_output_layout = model.get_tensor_layout(thl_output) - if thl_output_layout == DataLayout.NCHW: - thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True) - node_ind += 1 - - # now safe to assume number of channels is in last dimension - ifc = int(thl_in_shape[-1]) - # create node with no parallelization first - pe = 1 - - odt = model.get_tensor_datatype(thl_output) - scale = getCustomOp(node).get_nodeattr("out_scale") - assert scale == 1.0, ( - node.name + ": MultiThreshold out_scale must be 1 for HLS conversion." - ) - actval = getCustomOp(node).get_nodeattr("out_bias") - assert int(actval) == actval, ( - node.name + ": MultiThreshold out_bias must be integer for HLS conversion." - ) - actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requres actval < 0" - ) - # create and insert new Thresholding_Batch node - new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=ifc, - PE=pe, - numSteps=thl_thres_shape[1], - inputDataType=idt.name, - # weightDataType can be tightened by MinimizeAccumulatorWidth - weightDataType=idt.name, - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, - ) - graph.node.insert(insert_point, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferAddStreamsLayer(Transformation): - """Convert any Add into a AddStreams HLS layer.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Add": - in0 = node.input[0] - in1 = node.input[1] - result = node.output[0] - in0_shape = model.get_tensor_shape(in0) - in1_shape = model.get_tensor_shape(in1) - in0_static = not (model.get_initializer(in0) is None) - in1_static = not (model.get_initializer(in1) is None) - - # skip if different shapes on inputs - if in0_shape != in1_shape: - continue - # skip if any of inputs have initializers - # (this node is meant for adding two dynamic streams) - if in0_static or in1_static: - continue - - idt0 = model.get_tensor_datatype(in0) - idt1 = model.get_tensor_datatype(in1) - - # skip if different data types on inputs - if idt0 != idt1: - continue - - idt = idt0 - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # check layout and convert if necessary - in0_layout = model.get_tensor_layout(in0) - in1_layout = model.get_tensor_layout(in1) - result_layout = model.get_tensor_layout(result) - - if in0_layout == DataLayout.NCHW: - in0 = nchw_to_nhwc(in0, model, node_ind) - node_ind += 1 - in0_shape = model.get_tensor_shape(in0) - - if in1_layout == DataLayout.NCHW: - in1 = nchw_to_nhwc(in1, model, node_ind) - node_ind += 1 - in1_shape = model.get_tensor_shape(in1) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - - if result_layout == DataLayout.NCHW: - result = nchw_to_nhwc(result, model, node_ind, reverse=True) - node_ind += 1 - - # now safe to assume num_channels is size of last dimension - num_channels = int(in0_shape[-1]) - # create node with no parallelization first - pe = 1 - - # create and insert new AddStreams_Batch node - new_node = helper.make_node( - "AddStreams_Batch", - [in0, in1], - [result], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_channels, - PE=pe, - inputDataType=idt.name, - numInputVectors=in0_shape[:-1], - name="AddStreams_Batch_" + node.name, - ) - graph.node.insert(insert_point, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferDuplicateStreamsLayer(Transformation): - """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) - - dt = model.get_tensor_datatype(output_tensor) - - # skip conversion for layers with float input - if not dt.is_integer(): - continue - - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] - - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] - - # create node with no parallelization first - pe = 1 - - dup_node = helper.make_node( - "DuplicateStreams_Batch", - [output_tensor], - out_tensor_clones, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_Batch_" + node.name, - ) - - graph.node.insert(node_ind, dup_node) - - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break - - graph_modified = True - - if graph_modified: - model = model.transform(SortGraph()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferChannelwiseLinearLayer(Transformation): - """Convert any channel-wise Add/Mul into a HLS layer.""" - - def get_smallest_possible(self, vals): - """Returns smallest (fewest bits) possible DataType that can represent - value. Prefers unsigned integers where possible.""" - vals = np.array(vals, dtype=np.float64) - for v in vals: - assert int(v) == v, "Error float value" - - for k in DataType.get_accumulator_dt_cands(): - dt = DataType[k] - - if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: - # not currently supported - continue - - if (dt.min() <= vals).all() and (vals <= dt.max()).all(): - return dt - - warnings.warn( - """InferChannelwiseLinearLayer: Output values may not be - representable with supported data types. - Setting maximum width data type available. - This will lead to errors if there are no constrains on the input - """ - ) - - if (0 <= vals).all(): - return DataType["UINT64"] - else: - return DataType["INT64"] - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Add" or node.op_type == "Mul": - # assuming input[0] is dynamic - ll_input = node.input[0] - ll_output = node.output[0] - ll_in_shape = model.get_tensor_shape(ll_input) - - # check if input 1 has an initializer - ll_const = node.input[1] - if ll_const is not None: - ll_cinit = model.get_initializer(ll_const) - if ll_cinit is None: - # input 1 is also dynamic - continue - else: - continue - - # get number of channels and channel index from input - ll_in_layout = model.get_tensor_layout(ll_input) - if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: - ch_index = -1 - ch = ll_in_shape[-1] - elif ll_in_layout == DataLayout.NCHW: - ch_index = 1 - ch = ll_in_shape[1] - else: - continue - - # check if the shape of initializer is compatible - ll_cinit_shape = list(ll_cinit.shape) - if np.prod(ll_cinit_shape) == 1: - warnings.warn("Broadcasting " + str(node.op_type) + "(" + node.name + ")") - ll_cinit = np.full((ch), ll_cinit.flatten()[0]) - elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: - # parameter shape not compatible with Channelwise_batch - continue - - # check initializer contains integers as floats - if not (ll_cinit.astype(np.int32) == ll_cinit).all(): - continue - # all initializer conditions are met - - # check inputs - idt = model.get_tensor_datatype(ll_input) - if not idt.is_integer(): - # skip conversion for layers with float input - continue - - # check layout of inputs/outputs, and convert if needed - # check layout and convert if necessary - if ll_in_layout == DataLayout.NCHW: - ll_input = nchw_to_nhwc(ll_input, model, node_ind) - node_ind += 1 - ll_in_shape = model.get_tensor_shape(ll_input) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - ll_output_layout = model.get_tensor_layout(ll_output) - if ll_output_layout == DataLayout.NCHW: - ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) - node_ind += 1 - - # get parameter data type - param_min = min(ll_cinit.flatten()) - param_max = max(ll_cinit.flatten()) - pdt = self.get_smallest_possible([param_min, param_max]) - - # set function and determine output data type - if node.op_type == "Add": - func = "add" - out_min = idt.min() + param_min - out_max = idt.max() + param_max - odt = self.get_smallest_possible([out_min, out_max]) - elif node.op_type == "Mul": - func = "mul" - possible_limits = [] - possible_limits += [idt.min() * param_min] - possible_limits += [idt.min() * param_max] - possible_limits += [idt.max() * param_min] - possible_limits += [idt.max() * param_max] - odt = self.get_smallest_possible(possible_limits) - - model.set_initializer(ll_const, ll_cinit.reshape(ch)) - model.set_tensor_datatype(ll_output, odt) - - # create node with no parallelization first - pe = 1 - assert ch % pe == 0, "Requirement IFC divisable by PE is violated." - # create and insert node - new_node = helper.make_node( - "ChannelwiseOp_Batch", - [ll_input, ll_const], - [ll_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - Func=func, - NumChannels=ch, - PE=pe, - inputDataType=idt.name, - paramDataType=pdt.name, - outputDataType=odt.name, - numInputVectors=list(ll_in_shape[:-1]), - name="ChannelwiseOp_Batch_" + node.name, - ) - graph.node.insert(insert_point, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferLabelSelectLayer(Transformation): - """Convert any TopK into a LabelSelect HLS layer.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "TopK": - fc_input = node.input[0] - k_input = node.input[1] - val_output = node.output[0] - idx_output = node.output[1] - fc_in_shape = model.get_tensor_shape(fc_input) - - idt = model.get_tensor_datatype(fc_input) - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # skip conversion for if value output is connected (not supported) - if model.find_consumer(val_output) is not None: - continue - - num_labels = int(fc_in_shape[-1]) - num_inp_vecs = list(fc_in_shape[:-1]) - # create node with no parallelization first - pe = 1 - - k = model.get_initializer(k_input)[0] - - # create and insert new LabelSelect_Batch node - new_node = helper.make_node( - "LabelSelect_Batch", - [fc_input], - [idx_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - Labels=num_labels, - PE=pe, - K=k, - inputDataType=idt.name, - numInputVectors=num_inp_vecs, - name="LabelSelect_Batch_" + node.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferGlobalAccPoolLayer(Transformation): - """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "GlobalAveragePool": - in0 = node.input[0] - result = node.output[0] - in0_shape = model.get_tensor_shape(in0) - - idt = model.get_tensor_datatype(in0) - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # check layout and convert if necessary - in0_layout = model.get_tensor_layout(in0) - result_layout = model.get_tensor_layout(result) - - if in0_layout == DataLayout.NCHW: - in0 = nchw_to_nhwc(in0, model, node_ind) - node_ind += 1 - in0_shape = model.get_tensor_shape(in0) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - - if result_layout == DataLayout.NCHW: - result = nchw_to_nhwc(result, model, node_ind, reverse=True) - node_ind += 1 - - num_ch = int(in0_shape[-1]) - vecs = in0_shape[:-1] - # create node with no parallelization first - pe = 1 - - # create an additional tensor of the same shape and layout as result - out_shape = model.get_tensor_shape(result) - pool_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(pool_out) - pool_out = pool_out.name - model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) - - new_pool = helper.make_node( - "GlobalAccPool_Batch", - [in0], - [pool_out], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=idt.name, - numInputVectors=vecs, - name="GlobalAccPool_Batch_" + node.name, - ) - - mul_value = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] - ) - model.graph.value_info.append(mul_value) - model.set_initializer( - mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) - ) - new_mul = helper.make_node( - "Mul", - [pool_out, mul_value.name], - [result], - ) - graph.node.insert(insert_point, new_pool) - graph.node.insert(insert_point + 1, new_mul) - node_ind += 1 - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferLookupLayer(Transformation): - """Convert Gather nodes with constant op0 into Lookup HLS layers.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Gather": - emb_name = node.input[0] - embs = model.get_initializer(emb_name) - axis = get_by_name(node.attribute, "axis") - # skip conversion if input0 is not constant - if embs is None: - continue - # skip conversion if axis != 0 - if axis is not None and axis.i != 0: - continue - ind_name = node.input[1] - ind_dtype = model.get_tensor_datatype(ind_name) - emb_dtype = model.get_tensor_datatype(emb_name) - # skip conversion if inputs are not unsigned integers - if (not ind_dtype.is_integer()) or ind_dtype.signed(): - continue - num_embs, emb_dim = embs.shape - out_name = node.output[0] - ishape = model.get_tensor_shape(node.input[1]) - # create and insert new Lookup node - new_node = helper.make_node( - "Lookup", - [ind_name, emb_name], - [out_name], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - name="Lookup_" + node.name, - NumEmbeddings=num_embs, - EmbeddingDim=emb_dim, - EmbeddingType=emb_dtype.name, - InputType=ind_dtype.name, - InputShape=list(ishape), - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferConcatLayer(Transformation): - """Convert suitable Concat nodes (operating on last/-1 axis) - into StreamingConcat HLS layers.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Concat": - ishape = model.get_tensor_shape(node.input[0]) - axis = get_by_name(node.attribute, "axis") - if (axis is None) or (ishape is None): - continue - axis = axis.i - last_axis = len(ishape) - 1 - # skip conversion if not using last axis - if (axis != -1) and (axis != last_axis): - continue - # check datatype coherence - dt0 = model.get_tensor_datatype(node.input[0]) - if dt0 is None: - continue - dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input]) - if not dt_coherent: - continue - # skip conversion if any inputs are static - all_static = all([model.get_initializer(x) is None for x in node.input]) - if not all_static: - continue - # skip conversion if inputs are not integers - if not dt0.is_integer(): - continue - # ready for conversion - elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input] - inp_vec = list(model.get_tensor_shape(node.input[0])[:-1]) - new_node = helper.make_node( - "StreamingConcat", - node.input, - node.output, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - name="Concat_" + node.name, - ElemsPerStream=elems_per_stream, - inputDataType=dt0.name, - numInputVectors=inp_vec, - inFIFODepths=[2] * len(node.input), - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferStreamingEltwise(Transformation): - """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer - with SubEltwise or AbsDiffEltwise op.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Sub": - in0 = node.input[0] - in1 = node.input[1] - result = node.output[0] - in0_shape = model.get_tensor_shape(in0) - in1_shape = model.get_tensor_shape(in1) - in0_static = not (model.get_initializer(in0) is None) - in1_static = not (model.get_initializer(in1) is None) - - # skip if different shapes on inputs - if in0_shape != in1_shape: - continue - # skip if any of inputs have initializers - # (this node is meant for two dynamic streams) - if in0_static or in1_static: - continue - - idt0 = model.get_tensor_datatype(in0) - idt1 = model.get_tensor_datatype(in1) - - # skip conversion for layers with float input - if not (idt0.is_integer() and idt1.is_integer()): - continue - - eltwiseOp = "Sub" - nodes_to_remove = [node] - # look for a downstream Abs node - res_consumer = model.find_consumer(result) - if (res_consumer is not None) and (res_consumer.op_type == "Abs"): - eltwiseOp = "AbsDiff" - result = res_consumer.output[0] - nodes_to_remove.append(res_consumer) - - # check layout and convert if necessary - in0_layout = model.get_tensor_layout(in0) - in1_layout = model.get_tensor_layout(in1) - result_layout = model.get_tensor_layout(result) - - if in0_layout == DataLayout.NCHW: - in0 = nchw_to_nhwc(in0, model, node_ind) - node_ind += 1 - in0_shape = model.get_tensor_shape(in0) - - if in1_layout == DataLayout.NCHW: - in1 = nchw_to_nhwc(in1, model, node_ind) - node_ind += 1 - in1_shape = model.get_tensor_shape(in1) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - - if result_layout == DataLayout.NCHW: - result = nchw_to_nhwc(result, model, node_ind, reverse=True) - node_ind += 1 - - # now safe to assume num_channels is size of last dimension - num_channels = int(in0_shape[-1]) - # create node with no parallelization first - pe = 1 - - # create and insert new Eltwise node - new_node = helper.make_node( - "StreamingEltwise", - [in0, in1], - [result], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_channels, - PE=pe, - inputDataType0=idt0.name, - inputDataType1=idt1.name, - eltwiseOp=eltwiseOp, - numInputVectors=in0_shape[:-1], - name="StreamingEltwise_" + node.name, - ) - graph.node.insert(insert_point, new_node) - # remove old nodes - for nd in nodes_to_remove: - graph.node.remove(nd) - graph_modified = True - - # if graph_modified: - # model = model.transform(InferShapes()) - # model = model.transform(InferDataTypes()) - return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 1c316e1285..0ce0923934 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -285,14 +285,14 @@ def apply(self, model): ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") if self.signature: ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") - if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA_hls"]: + if model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"]: warnings.warn( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware behavior. It is strongly recommended to insert FIFOs prior to calling CreateStitchedIP.""" ) - if model.graph.node[0].op_type == "StreamingFIFO": + if model.graph.node[0].op_type == "StreamingFIFO_rtl": firstfifo = getCustomOp(model.graph.node[0]) if firstfifo.get_nodeattr("impl_style") == "vivado": warnings.warn( @@ -349,7 +349,7 @@ def apply(self, model): if self.signature: # extract number of checksum layer from graph - checksum_layers = model.get_nodes_by_op_type("checksum") + checksum_layers = model.get_nodes_by_op_type("CheckSum_hls") self.insert_signature(len(checksum_layers)) # create a temporary folder for the project diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py index d5699e4dc6..dee9b62e67 100644 --- a/src/finn/transformation/fpgadataflow/derive_characteristic.py +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -75,7 +75,7 @@ def apply(self, model: ModelWrapper): return (model, run_again) # apply manual fix for DuplicateStreams and AddStreams for # simple residual reconvergent paths with bypass - addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + addstrm_nodes = model.get_nodes_by_op_type("AddStreams_hls") for addstrm_node in addstrm_nodes: # we currently only support the case where one branch is # a bypass @@ -84,8 +84,8 @@ def apply(self, model: ModelWrapper): if (b0 is None) or (b1 is None): warnings.warn("Found unsupported AddStreams, skipping") return (model, run_again) - b0_is_bypass = b0.op_type == "DuplicateStreams_Batch" - b1_is_bypass = b1.op_type == "DuplicateStreams_Batch" + b0_is_bypass = b0.op_type == "DuplicateStreams_hls" + b1_is_bypass = b1.op_type == "DuplicateStreams_hls" if (not b0_is_bypass) and (not b1_is_bypass): warnings.warn("Found unsupported AddStreams, skipping") return (model, run_again) diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 6149dffd59..5231fc288b 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -95,7 +95,7 @@ def apply(self, model): narrow_neighbour = model.find_producer(node.input[0]) node_slr = getCustomOp(narrow_neighbour).get_nodeattr("slr") node_inst.set_nodeattr("slr", node_slr) - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): # if we have SLR assignment already. use that if node_slr != -1: continue diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index f6dd587c76..e3a52f68f0 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -90,7 +90,7 @@ def apply(self, model): if ( consumer.op_type.startswith("MatrixVectorActivation") and n1.get_nodeattr("mem_mode") == "external" - ) or (consumer.op_type == "StreamingConcat"): + ) or (consumer.op_type.startswith("StreamingConcat")): # get input idx in_idx = None for idx, n_input in enumerate(consumer.input): diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 4efadf0f27..630310842c 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -37,7 +37,7 @@ def _is_fifo_node(node): - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): return True else: return False @@ -184,7 +184,10 @@ def apply(self, model): for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) # insert FIFO as first node, except when first node is DMA - if first_node.op_type != "StreamingFIFO" and first_node.op_type != "IODMA_hls": + if ( + not first_node.op_type.startswith("StreamingFIFO") + and first_node.op_type != "IODMA_hls" + ): inp_ind = list(first_node.input).index(graph_in_name) n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) @@ -238,7 +241,10 @@ def apply(self, model): graph_out_names = [x.name for x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if final_node.op_type != "StreamingFIFO" and final_node.op_type != "IODMA_hls": + if ( + not final_node.op_type.startswith("StreamingFIFO") + and final_node.op_type != "IODMA_hls" + ): assert ( final_node.op_type != "TLastMarker_hls" ), """Insert tlast marker should be done diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index fbb64428aa..431ca8e0b5 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -119,7 +119,7 @@ def apply(self, model): if inp_idx > 0: if first_node.op_type.startswith("MatrixVectorActivation") and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) - elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: + elif first_node.op_type.startswith("AddStreams") and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) else: raise Exception("No method to determine stream width") diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 75c35df7d7..5d3b42b0c0 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -103,7 +103,7 @@ def apply(self, model): else: is_first_node = True if ( - node.op_type == "StreamingFIFO" + node.op_type.startswith("StreamingFIFO") and getCustomOp(node).get_nodeattr("depth") <= self.shallow_threshold and (not is_first_node) ): @@ -167,12 +167,12 @@ def apply(self, model): for node in model.graph.node: # look for following pattern: # ConvolutionInputGenerator -> StreamingFIFO -> MatrixVectorActivation - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): fifo_prod = model.find_producer(node.input[0]) fifo_cons = model.find_consumer(node.output[0]) if fifo_prod is None: continue - if fifo_prod.op_type != "ConvolutionInputGenerator": + if not fifo_prod.op_type.startswith("ConvolutionInputGenerator"): continue if fifo_cons is None: continue @@ -266,7 +266,8 @@ def apply(self, model): for node in model.graph.node: # verify assumptions assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str(node) - assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" + op_type = node.op_type + assert not op_type.startswith("StreamingFIFO"), "Found existing StreamingFIFO node" node = getCustomOp(node) ifd = node.get_nodeattr("inFIFODepths") ofd = node.get_nodeattr("outFIFODepths") @@ -283,7 +284,7 @@ def apply(self, model): node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - if node.op_type in extw_optypes: + if op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -376,7 +377,9 @@ def apply(self, model): else: # do rtlsim in C++ for FIFO sizing # determine # inputs for FIFO sizing according to topology type - swg_nodes = [x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type] + swg_nodes = [ + x for x in model.graph.node if x.op_type.startswith("ConvolutionInputGenerator") + ] if len(swg_nodes) == 0: # MLP, no layer overlap # assuming half the nodes are now FIFOs, use half the # of @@ -400,7 +403,7 @@ def apply(self, model): for node in model.graph.node: # set FIFO depth, reset FIFO implementation, # and set implementation/ram styles - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): assert node.name in fifos, "FIFO node not found in size dictionary" # set depth of FIFO depth = optimize_depth(fifos[node.name]) @@ -444,7 +447,7 @@ def apply(self, model): # reflect final values in attributes for node in model.graph.node: - if node.op_type != "StreamingFIFO": + if not node.op_type.startswith("StreamingFIFO"): node_inst = getCustomOp(node) fifodepth_in = [] for node_inp in node.input: @@ -459,7 +462,7 @@ def apply(self, model): pass else: # there is a producer for this input - if prod.op_type == "StreamingFIFO": + if prod.op_type.startswith("StreamingFIFO"): prod_inst = getCustomOp(prod) fifodepth_in.append(prod_inst.get_nodeattr("depth")) else: @@ -478,7 +481,7 @@ def apply(self, model): pass else: # there is a consumer for this input - if cons.op_type == "StreamingFIFO": + if cons.op_type.startswith("StreamingFIFO"): cons_inst = getCustomOp(cons) fifodepth_out.append(cons_inst.get_nodeattr("depth")) else: @@ -565,7 +568,7 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): n_inst = getCustomOp(node) depth = n_inst.get_nodeattr("depth") cfgs = get_fifo_split_configs(depth, self.max_qsrl_depth, self.max_vivado_depth) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 62457f164a..83f4138668 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -99,33 +99,32 @@ def apply(self, model): graph = model.graph # these ops use PE parallelism, up to a max value of NumChannels pe_ops = [ - "AddStreams_Batch", - "ChannelwiseOp_Batch", - "DuplicateStreams_Batch", - "GlobalAccPool_Batch", - "Thresholding_Batch", + "AddStreams_hls", + "ChannelwiseOp_hls", + "DuplicateStreams_hls", + "GlobalAccPool_hls", + "Thresholding_hls", ] # these ops use SIMD parallelism, up to a max value of NumChannels # ConvolutionInputGenerator* has a special case when depthwise=1 # ConvolutionInputGenerator_rtl supports additional parallelism by # setting parallel_window=1 mode after maxing out SIMD simd_ops = [ - "DownSampler", - "FMPadding_Batch", - "FMPadding_Pixel", - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", + "DownSampler_hls", + "FMPadding_hls", + "FMPadding_Pixel_hls", + "ConvolutionInputGenerator_hls", "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] + depthwise_op_exceptions = ["VectorVectorActivation_hls", "Pool_hls"] for node in graph.node: if not is_fpgadataflow_node(node): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type.startswith("MatrixVectorActivation"): + if op_type == "MatrixVectorActivation_hls": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) @@ -152,12 +151,12 @@ def apply(self, model): elif op_type in pe_ops: max_pe = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_pe, "PE") - elif op_type == "LabelSelect_Batch": + elif op_type == "LabelSelect_hls": max_pe = node_inst.get_nodeattr("Labels") self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VectorVectorActivation": + if op_type == "VectorVectorActivation_hls": node_inst.set_nodeattr("SIMD", 1) max_pe = node_inst.get_nodeattr("Channels") self.optimize_attribute_val(node_inst, max_pe, "PE") @@ -165,7 +164,7 @@ def apply(self, model): pe = node_inst.get_nodeattr("PE") cyc = node_inst.get_exp_cycles() if ( - op_type == "VectorVectorActivation" + op_type == "VectorVectorActivation_hls" and pe == max_pe and cyc > self.target_cycles_per_frame ): @@ -187,9 +186,9 @@ def apply(self, model): else: swu_node_inst.set_nodeattr("parallel_window", 0) else: - if op_type == "VectorVectorActivation": + if op_type == "VectorVectorActivation_hls": ksize = np.prod(node_inst.get_nodeattr("Kernel")) - elif op_type == "Pool_Batch": + elif op_type == "Pool_hls": ksize = node_inst.get_nodeattr("KernelSize") else: raise Exception("Undefined edge case for %s" % op_type) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index d06f7d524e..691d7aed34 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -30,8 +30,6 @@ from onnx import helper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from qonnx.transformation.infer_datatypes import InferDataTypes -from qonnx.transformation.infer_shapes import InferShapes from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants @@ -225,7 +223,4 @@ def apply(self, model): # remove old nodes graph.node.remove(node) graph_modified = True - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) return (model, graph_modified) diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index ed553e7cee..6b5fa5516f 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -50,7 +50,7 @@ def apply(self, model): producer = model.find_producer(transp_node.input[0]) if _is_fpgadataflow_node(producer) is True: consumer = model.find_consumer(n.output[0]) - if consumer.op_type == "MatrixVectorActivation": + if consumer.op_type.startswith("MatrixVectorActivation"): fc_inst = getCustomOp(consumer) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index 318ba7045e..7486402be5 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -147,7 +147,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000): fifo_log = [] fifo_log_templ = ' results_file << "maxcount%s" << "\\t" ' fifo_log_templ += "<< to_string(top->maxcount%s) << endl;" - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl") fifo_ind = 0 for fifo_node in fifo_nodes: fifo_node = getCustomOp(fifo_node) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 53e5bb85eb..1dab57a7d3 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -704,9 +704,6 @@ def test_set_fifo_depths(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) - fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") - assert len(fifo_layers) > 0 - model = model.transform(SpecializeLayers()) fifo_layers = model.get_nodes_by_op_type("StreamingFIFO_rtl") assert len(fifo_layers) > 0 model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 3649d6709e..c8f80a8e1b 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -50,6 +50,7 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(output_dir + "/time_per_step.json") assert os.path.isfile(output_dir + "/auto_folding_config.json") assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + "/driver/driver.py") assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json") From 7c3ccd33eae027afbd521fcf49af218f14c5959c Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 9 Feb 2024 16:30:02 +0000 Subject: [PATCH 122/291] [Tests] Change cnv dictionary for bnn pynq test --- tests/end2end/test_end2end_bnn_pynq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 1dab57a7d3..d95cc1dc4c 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -625,7 +625,7 @@ def test_specialize_layers(self, topology, wbits, abits, board): "cnv": [ ("Transpose", 1), ("Thresholding_hls", 1), - ("ConvolutionInputGenerator_rtl", 6), + ("ConvolutionInputGenerator_hls", 6), ("MatrixVectorActivation_hls", 9), ("StreamingMaxPool_hls", 2), ("LabelSelect_hls", 1), From 64c0c7d4509c8e17c3b9da174280eac8a07d74ed Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 9 Feb 2024 16:52:38 +0000 Subject: [PATCH 123/291] [Tests] Update folding test --- tests/fpgadataflow/test_set_folding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index ce9f4b12ed..4992bf59f8 100644 --- a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -64,10 +64,10 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): simd = 1 FCLayer_nodes += [ helper.make_node( - "MatrixVectorActivation", + "MatrixVectorActivation_hls", [tensors[i].name, "weights_" + str(i), "thresh_" + str(i)], [tensors[i + 1].name], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=ch, MH=ch, From ba56a2d9bd63eded63e32ed039ef3b5c35dc1394 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 12 Feb 2024 12:13:48 +0000 Subject: [PATCH 124/291] [Tests] Update fifo and ipstitch test to new flow --- tests/fpgadataflow/test_fpgadataflow_fifo.py | 4 +++- .../fpgadataflow/test_fpgadataflow_ipstitch.py | 17 +++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index ecbf867b69..1719da1454 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -59,6 +60,7 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): backend="fpgadataflow", depth=Depth, folded_shape=fld_shape, + normal_shape=Shape, dataType=str(finn_dtype.name), ) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index aedb151af9..846f2c1fe0 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -78,10 +79,10 @@ def create_one_fc_model(mem_mode="const"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MatrixVectorActivation_hls", ["inp", "w0"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -130,10 +131,10 @@ def create_two_fc_model(mem_mode="decoupled"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MatrixVectorActivation_hls", ["inp", "w0"], ["mid"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -149,10 +150,10 @@ def create_two_fc_model(mem_mode="decoupled"): ) fc1 = helper.make_node( - "MatrixVectorActivation", + "MatrixVectorActivation_hls", ["mid", "w1"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -208,7 +209,7 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) - assert model.graph.node[0].op_type == "MatrixVectorActivation" + assert model.graph.node[0].op_type == "MatrixVectorActivation_hls" assert model.graph.node[-1].op_type == "TLastMarker_hls" model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) From 79ef071995cf4d3c20b2687a03e5d3e461cb11dd Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 13 Feb 2024 11:28:12 +0000 Subject: [PATCH 125/291] [CustomOp] Fix typo in HLS SWG LUT estimation --- .../custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py index 585f152550..4a5c02ee06 100644 --- a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -241,7 +241,7 @@ def bram_estimation(self): def lut_estimation(self): simd = self.get_nodeattr("SIMD") - is1D = self.get_noadeattr("is1D") + is1D = self.get_nodeattr("is1D") if not is1D: ifm_ch = self.get_nodeattr("IFMChannels") ifm_dim = self.get_nodeattr("IFMDim")[0] From 5b10b9878caf9dbb226e2e110e3db58a9c54dd7c Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 13 Feb 2024 11:28:48 +0000 Subject: [PATCH 126/291] [Tests] Update cybsec mlp test to new flow --- tests/end2end/test_end2end_cybsec_mlp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 1cd38eb83a..b58b9f472c 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -168,6 +168,7 @@ def test_end2end_cybsec_mlp_build(): # check the generated files assert os.path.isfile(output_dir + "/time_per_step.json") assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" assert os.path.isfile(est_cycles_report) @@ -181,8 +182,8 @@ def test_end2end_cybsec_mlp_build(): # examine the report contents with open(est_cycles_report, "r") as f: est_cycles_dict = json.load(f) - assert est_cycles_dict["MatrixVectorActivation_0"] == 80 - assert est_cycles_dict["MatrixVectorActivation_1"] == 64 + assert est_cycles_dict["MatrixVectorActivation_hls_0"] == 80 + assert est_cycles_dict["MatrixVectorActivation_hls_1"] == 64 with open(est_res_report, "r") as f: est_res_dict = json.load(f) assert est_res_dict["total"]["LUT"] == 7899.0 From 100d2812be58299f4fd38a7c46bd5d4a92cf48f2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 13 Feb 2024 15:45:26 +0000 Subject: [PATCH 127/291] [hw mvau]: remove dsp/lut estimation functions, modified how ip gets stitched in and bugfix to execution of 2D tensors --- .../fpgadataflow/matrixvectoractivation.py | 99 ++----------------- 1 file changed, 10 insertions(+), 89 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 463a4effa8..baa70c580c 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -150,11 +150,13 @@ def execute_node(self, context, graph): odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" out_scale = 2 if odt_is_bipolar else 1 out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") - # NHWC to NCHW for multithreshold node - result = result.transpose((0, 3, 1, 2)) + if result.ndim == 4: + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) result = multithreshold(result, mvau_thr, out_scale, out_bias) - # NCHW to NHWC - result = result.transpose((0, 2, 3, 1)) + if result.ndim == 4: + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) context[node.output[0]] = result @@ -436,84 +438,6 @@ def uram_efficiency_estimation(self): uram_est_capacity = uram_est * 72 * 4096 return wbits / uram_est_capacity - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_datatype = self.get_accumulator_datatype() - # if accDataType is not set, then it will default to INT32, which would - # be a large overestimate in most (if not all) cases. In this scenario, - # we would use the minimum accumulator as determined by the data types - # bound, derived in https://arxiv.org/abs/2301.13376 - alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) - acc_bits = min( - acc_datatype.bitwidth(), - np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), - ) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - tmem_style = self.get_nodeattr("ram_style_thresholds") - if (noact == 0) and (tmem_style == "distributed"): - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -953,12 +877,9 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) - + # Instantiate either the HLS or RTL IP depending on operator + self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -1029,7 +950,7 @@ def code_generation_ipi(self): cmd.append("save_bd_design") elif mem_mode == "const" or mem_mode == "external": # base class impl sufficient for const/external modes - return super().code_generation_ipi() + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd From 3a36ef12f0a918f597e15db95fc20ea53b6700fb Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 13 Feb 2024 15:46:07 +0000 Subject: [PATCH 128/291] [hls mvau]: added lut/dsp estimation functions, instantiate_ip method and bugfix to node execution --- .../hls/matrixvectoractivation_hls.py | 113 +++++++++++++++++- 1 file changed, 108 insertions(+), 5 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 5206ee3a06..aa3631a240 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -33,6 +33,7 @@ from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from pyverilator.util.axi_utils import toggle_clk, reset_rtlsim # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -54,6 +55,84 @@ def get_nodeattr_types(self): my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "decoupled" and mstyle == "distributed") or ( + mmode == "const" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + tmem_style = self.get_nodeattr("ram_style_thresholds") + if (noact == 0) and (tmem_style == "distributed"): + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -416,6 +495,7 @@ def execute_node(self, context, graph): mem_mode = self.get_nodeattr("mem_mode") node = self.onnx_node + # TODO ensure codegen dir exists if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") elif mode == "rtlsim": @@ -433,6 +513,7 @@ def execute_node(self, context, graph): for inputs in node.input: # it is assumed that the first input of the node is the data input # the second input are the weights + # the third input are the thresholds if in_ind == 0: assert ( str(context[inputs].dtype) == "float32" @@ -440,7 +521,12 @@ def execute_node(self, context, graph): not float32 as expected.""" expected_inp_shape = self.get_folded_input_shape() reshaped_input = context[inputs].reshape(expected_inp_shape) - export_idt = self.get_input_datatype() + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() # make copy before saving the array reshaped_input = reshaped_input.copy() np.save( @@ -468,11 +554,15 @@ def execute_node(self, context, graph): sim = self.get_rtlsim() nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - self.reset_rtlsim(sim) - self.toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: + reset_rtlsim(sim) + toggle_clk(sim) + if mem_mode == "external" or mem_mode == "decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict = { @@ -489,6 +579,7 @@ def execute_node(self, context, graph): out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output output = np.load(out_npy_path) oshape = self.get_normal_output_shape() @@ -497,7 +588,19 @@ def execute_node(self, context, graph): else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to "rtlsim" """.format( + has to be set to one of the following value ("cppsim", "rtlsim")""".format( mode ) ) + + def instantiate_ip(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "decoupled": + cmd.append( + "create_bd_cell -type ip -vlnv %s /%s/%s" + % (vlnv, node_name, node_name) + ) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)) \ No newline at end of file From 4266e0872686b967f3b2ec0d0d68d4d852138cb5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 13 Feb 2024 16:12:09 +0000 Subject: [PATCH 129/291] [test]: added GiveUniqueNodeNames transform and changed RTLsim test preparation --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 7e632b4018..1853392724 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -273,6 +273,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): else: tdt = DataType["INT32"] model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + model = model.transform(GiveUniqueNodeNames()) for node in model.graph.node: # lookup op_type in registry of CustomOps inst = getCustomOp(node) @@ -280,6 +281,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # Note: only HLS-based MVAU layers execute CPPsim inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -389,8 +391,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... model = model.transform(SpecializeLayers()) - # model = model.transform(SetExecMode("rtlsim")) - model.set_metadata_prop("exec_mode", "rtlsim") + model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) From 5dfc440695c4530e2e0cf517e7a60a373cfd6019 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 13 Feb 2024 16:14:55 +0000 Subject: [PATCH 130/291] post linting --- .../fpgadataflow/hls/matrixvectoractivation_hls.py | 10 ++++------ .../custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index aa3631a240..f2119667bf 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -26,14 +26,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import math import numpy as np import os +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -from pyverilator.util.axi_utils import toggle_clk, reset_rtlsim # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -598,9 +599,6 @@ def instantiate_ip(self, cmd): vlnv = self.get_nodeattr("ip_vlnv") node_name = self.onnx_node.name if self.get_nodeattr("mem_mode") == "decoupled": - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (vlnv, node_name, node_name) - ) + cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) else: - cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)) \ No newline at end of file + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index baa70c580c..1b6be752dc 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -879,7 +879,7 @@ def code_generation_ipi(self): ) # Instantiate either the HLS or RTL IP depending on operator self.instantiate_ip(cmd) - + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" From a6a3d4cc7dfb99ca48c2f543fe019c11681a7f21 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 13 Feb 2024 17:03:41 +0000 Subject: [PATCH 131/291] [tests] Split threshold runtime tests to runtime read and write tests --- .../test_fpgadataflow_thresholding.py | 167 ++++++++++++++---- 1 file changed, 128 insertions(+), 39 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 3daf44a055..f1be5f89a7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -42,7 +42,7 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.core.rtlsim_exec import rtlsim_exec +from finn.core.rtlsim_exec import rtlsim_exec, reset_rtlsim from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -150,7 +150,7 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ odt = act n_steps = act.get_num_possible_values() - 1 - + # Generate random, non-decreasing thresholds thresholds = generate_random_threshold_values( idt, ich, n_steps @@ -165,16 +165,16 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ # Build DUT model = make_single_thresholding_modelwrapper( - impl_style, - thresholds, - pe, - idt, - odt, - actval, - mem_mode, + impl_style, + thresholds, + pe, + idt, + odt, + actval, + mem_mode, n_inp_vecs ) - + # Expected Reference output # multithreshold util fxn wants NCHW input, not NHWC x_nchw = layout_FINN2NCHW(x) @@ -238,24 +238,29 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ assert exp_cycles != 0 @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize("cf", [2]) +@pytest.mark.parametrize("ch", [6]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_single_layer(impl_style): +def test_runtime_thresholds_read(impl_style,cf,ch): + """ Read back threshold weights during runtime + + 1. Create random initial weights T + 2. Execute model + 3. Read back weights via AXI + 4. Compare with initial weights T + """ n_inp_vecs = [1, 2, 2] mem_mode = "decoupled" act = DataType["INT4"] idt = DataType["INT16"] - nf = 8 - ich = 16 - pe = ich // nf - assert ich % pe == 0 - - # generate input data - in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) + pe = ch // cf + assert ch % pe == 0 odt = act n_steps = act.get_num_possible_values() - 1 - T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) + np.random.seed(2) + T = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) # provide non-decreasing thresholds T = np.sort(T, axis=1) @@ -290,10 +295,12 @@ def test_runtime_thresholds_single_layer(impl_style): # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while # we read/write new ones and reads seem to cause a disturbance too) + # generate input data + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) + exec_ctx = {"inp": in_tensor} extracted_weight_stream = [] - def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): @@ -301,51 +308,133 @@ def read_weights(sim): addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) + + # Validate the AXI Read weights assert extracted_weight_stream == old_weight_stream - # only use second batch element in output; first will be invalid due to - # old weights (see above) - y = exec_ctx["outp"][1] + + y = exec_ctx["outp"][0] # multithreshold util fxn wants NCHW input, not NHWC expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T) # convert back to NHWC for comparison to hw outputs expected = np.transpose(expected, (0, 2, 3, 1))[1] - # expected = multithreshold(in_tensor, T)[1] if act == DataType["BIPOLAR"]: - # binary to bipolar + # binary to bipolarW expected = 2 * expected - 1 else: # signed offset expected += act.min() + + # Validate the output is as expected assert (y == expected).all() - new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +@pytest.mark.parametrize("cf", [8]) +@pytest.mark.parametrize("ch", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_runtime_thresholds_write(impl_style,cf,ch): + """ Write threshold weights during runtime + + 1. Create random initial weights T_init + 2. Create model with initial weights + 3. Create new set of weights T_write + 4. Write T_write using AXI bus + 5. Read back using AXI bus to T_read + 6. Compare T_write and T_read + 7. Validate outputs with expected vectors + """ + n_inp_vecs = [1, 2, 2] + mem_mode = "decoupled" + act = DataType["INT4"] + idt = DataType["INT16"] + pe = ch // cf + assert ch % pe == 0 + + odt = act + n_steps = act.get_num_possible_values() - 1 + np.random.seed(2) + T_init = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T_init = np.sort(T_init, axis=1) + + if odt == DataType["BIPOLAR"]: + actval = 0 + else: + actval = odt.min() + + model = make_single_thresholding_modelwrapper(impl_style, T_init, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = model.transform(SpecializeLayers()) + + # Validate that specialize layer did not default to HLS implementation + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + op_inst = getCustomOp(model.graph.node[0]) + op_inst.set_nodeattr("runtime_writeable_weights", 1) + + # Make new weights for runtime write + np.random.seed(4) + T_write = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) # provide non-decreasing thresholds - new_weights = np.sort(T, axis=1) - op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat") - with open("new_weights.dat", "r") as f: - new_weight_stream = f.read().strip() - os.remove("new_weights.dat") - new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n")) - new_weight_stream = list(new_weight_stream) + T_write = np.sort(T_write, axis=1) + + op_inst.make_weight_file(T_write, "decoupled_runtime", "T_write.dat") + with open("T_write.dat", "r") as f: + T_write_stream = f.read().strip() + os.remove("T_write.dat") + + T_write_stream = map(lambda x: int(x, 16), T_write_stream.split("\n")) + T_write_stream = list(T_write_stream) + + # need to create stitched IP for runtime weight testing + model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareRTLSim()) + model.set_metadata_prop("exec_mode", "rtlsim") + # add two copies of the input tensor as the first one is just used to + # "flush out" the pipeline (as mvau already starts receiving old weights while + # we read/write new ones and reads seem to cause a disturbance too) + # generate input data + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) + in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) + # trace_file = "trace_wr_01.vcd" + # model.set_metadata_prop("rtlsim_trace",trace_file) + exec_ctx_write = {"inp": in_tensor} def write_weights(sim): addr = 0 - for nw in new_weight_stream: + for nw in T_write_stream: axilite_write(sim, addr, nw, basename="s_axilite_0_") addr += 4 + T_read_stream = [] + def read_weights(sim): + addr = 0 + for i in range(len(T_write_stream)): + T_read_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) + addr += 4 + + rtlsim_exec(model, exec_ctx_write, pre_hook=write_weights, post_hook=read_weights) + + y = exec_ctx_write["outp"][1] + + assert T_read_stream == T_write_stream - rtlsim_exec(model, exec_ctx, pre_hook=write_weights) - y = exec_ctx["outp"][1] # multithreshold util fxn wants NCHW input, not NHWC - expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), new_weights) + expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T_write) # convert back to NHWC for comparison to hw outputs expected = np.transpose(expected, (0, 2, 3, 1))[1] + if act == DataType["BIPOLAR"]: - # binary to bipolar + # binary to bipolarW expected = 2 * expected - 1 else: # signed offset expected += act.min() - assert (y == expected).all() + + # Validate the output is as expected + assert (y == expected).all() \ No newline at end of file From 9c96192eeac5fd709eb72e9c0f1df0e2c480ba6c Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 13 Feb 2024 17:04:26 +0000 Subject: [PATCH 132/291] [CustomOp] Zero pad row of threshold weight dat file --- .../fpgadataflow/rtl/thresholding_rtl.py | 77 ++++++------------- 1 file changed, 24 insertions(+), 53 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 714930b73d..54797e1b94 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -183,7 +183,7 @@ def get_weight_datatype(self): def minimize_accumulator_width(self, model): "Minimize threshold width ('accumulator width' here due to convention)" thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() min_input = self.get_input_datatype().min() @@ -248,7 +248,7 @@ def get_exp_cycles(self): # Channels/PE * batch size * fmdim * fmdim return np.prod(self.get_folded_output_shape()[:-1]) - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 @@ -661,12 +661,13 @@ def get_verilog_top_module_intf_names(self): return intf_names - def get_dynamic_config(self, model, address_stride=1): + def get_dynamic_config(self, weights, address_stride=1): """Returns a configuration dictionary containing axilite write commands in order to program the thresholds into the RTL core during runtime. The default address stride for the weights is 1 byte.""" - thresholds = model.get_initializer(self.onnx_node.input[1]) + # thresholds = model.get_initializer(self.onnx_node.input[1]) + thresholds = weights num_channels, num_weights_per_channel = thresholds.shape weight_addr_boundary = find_next_power_of_2(num_weights_per_channel) @@ -740,7 +741,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ - threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + threshold_tensor = self.get_hw_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() assert np.vectorize(tdt.allowed)( threshold_tensor @@ -760,52 +761,22 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): 1, -1, pe * n_thres_steps ) decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() - width_padded = roundup_to_integer_multiple(pe * n_thres_steps, 4) - - # zero pad the columns - thres_padded = np.zeros((1, self.calc_tmem() ,width_padded)) - thres_padded[0, :self.calc_tmem(), :(pe * n_thres_steps) ] = decoupled_thres_pe_flipped - decoupled_thres_pe_flipped = thres_padded.copy() - weight_tensor_pe_flipped = [] - if weight_file_mode == "decoupled_npy": - # save weight stream into npy for cppsim - np.save(weight_file_name, decoupled_thres) - elif weight_file_mode == "decoupled_verilog_dat": - # convert weight values into hexstring - weight_width = self.get_weightstream_width() - # pad to nearest 4 bits to get hex strings - weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - decoupled_thres_pe_flipped, tdt, weight_width_padded, prefix="" - ) - weight_stream = weight_tensor_pe_flipped.flatten() - weight_stream = weight_stream.copy() - with open(weight_file_name, "w") as f: - for val in weight_stream: - f.write(val + "\n") - elif weight_file_mode == "decoupled_runtime": - # memstream axi-lite interface will map each mem line to - # one or multiple 32-bit words - weight_width = self.get_weightstream_width() - words_per_memwidth = 2 ** ceil(log2(weight_width / 32)) - if words_per_memwidth < 1: - words_per_memwidth = 1 - weight_width_padded = words_per_memwidth * 32 # convert to bits - # first, pack and ensure padding to 32 bits - for channel in decoupled_thres_pe_flipped[0]: - for weight in channel: - wdt = self.get_weight_datatype() - bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) - weight_tensor_pe_flipped.append(pack_innermost_dim_as_hex_string( - [weight], wdt, bw_hexdigit, prefix="" - ).item()) - weight_stream = weight_tensor_pe_flipped.copy() - - with open(weight_file_name, "w") as f: - for val in weight_stream: - f.write(val + "\n") - else: - raise Exception("Decoupled weight export not yet implemented") - else: - raise Exception("Unknown weight_file_mode") + width_padded = roundup_to_integer_multiple(weights.shape[1], 4) + # # zero pad the row + weight_padded = np.zeros((weights.shape[0],width_padded)) + weight_padded[:weights.shape[0], :n_thres_steps ] = weights + weight_stream = [] + for channel in weight_padded: + for weight in channel: + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) + weight_stream.append(pack_innermost_dim_as_hex_string( + [weight], wdt, bw_hexdigit, prefix="" + ).item()) + + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") + else: + raise Exception("Unknown weight_file_mode") \ No newline at end of file From 526e71fb61ff29758aef47d874b34f59c0451c3a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 16 Feb 2024 15:19:05 +0000 Subject: [PATCH 133/291] [hls mvau]: minor style change --- .../custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index f2119667bf..5b85323f32 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -601,4 +601,4 @@ def instantiate_ip(self, cmd): if self.get_nodeattr("mem_mode") == "decoupled": cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) else: - cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)) + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) \ No newline at end of file From 1091ce9214a98daaf1f127f2cc86e54e075d0640 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 16 Feb 2024 15:25:57 +0000 Subject: [PATCH 134/291] [Builder] Expose swg expection for FIFOs to build args --- src/finn/builder/build_dataflow_config.py | 4 ++++ src/finn/builder/build_dataflow_steps.py | 1 + 2 files changed, 5 insertions(+) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 1b22265a4d..4cbcfb21c3 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -296,6 +296,10 @@ class DataflowBuildConfig: #: Which memory mode will be used for compute layers default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED + #: Call CapConvolutionFIFODepths in InsertAndSetFIFODepths transform + #: to make convolution FIFOs smaller where appropriate + default_swg_exception: Optional[bool] = False + #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` #: e.g. "xilinx_u250_xdma_201830_2" diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index d031e971f1..a75bbe98a1 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -573,6 +573,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): InsertAndSetFIFODepths( cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), + swg_exception=cfg.default_swg_exception, vivado_ram_style=cfg.large_fifo_mem_style, force_python_sim=force_python_sim, ) From 462a79c6ab308fef29885b9c5911aec8b3634d69 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 16 Feb 2024 15:28:57 +0000 Subject: [PATCH 135/291] linting --- .../custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 5b85323f32..f40e6d78e8 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -601,4 +601,4 @@ def instantiate_ip(self, cmd): if self.get_nodeattr("mem_mode") == "decoupled": cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) else: - cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) \ No newline at end of file + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) From f31f8449c21950daf9543333b09e66c472eac068 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 16 Feb 2024 15:30:02 +0000 Subject: [PATCH 136/291] [IPStitching] Check if node has hls or rtl backend --- .../transformation/fpgadataflow/create_stitched_ip.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 0ce0923934..a8ecdcf484 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,7 +41,7 @@ ReplaceVerilogRelPaths, ) from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def is_external_input(model, node, i): @@ -302,7 +303,9 @@ def apply(self, model): ) for node in model.graph.node: # ensure that all nodes are fpgadataflow, and that IPs are generated - assert is_fpgadataflow_node(node), "All nodes must be FINN fpgadataflow nodes." + assert is_hls_node(node) or is_rtl_node( + node + ), "All nodes must be FINN fpgadataflow nodes." node_inst = getCustomOp(node) ip_dir_value = node_inst.get_nodeattr("ip_path") assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist." From 91679a1f7af6b8dc6ee630bd366730cced3bfa27 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 16 Feb 2024 16:48:35 +0000 Subject: [PATCH 137/291] [MVAU] Shorten op type MatrixVectorActivation to MVAU --- .../analysis/fpgadataflow/res_estimation.py | 10 ++++---- src/finn/custom_op/fpgadataflow/__init__.py | 4 ++-- .../custom_op/fpgadataflow/hls/__init__.py | 6 ++--- .../hls/matrixvectoractivation_hls.py | 6 ++--- .../fpgadataflow/matrixvectoractivation.py | 5 +--- .../build_dataflow/folding_config.json | 12 +++++----- .../specialize_layers_config.json | 8 +++---- .../test_ext_weights/tfc-w1a1-extw.json | 12 +++++----- .../fpgadataflow/convert_to_hw_layers.py | 12 +++++----- .../fpgadataflow/create_stitched_ip.py | 2 +- .../transformation/fpgadataflow/floorplan.py | 2 +- .../transformation/fpgadataflow/insert_dwc.py | 2 +- .../fpgadataflow/insert_iodma.py | 3 +-- .../fpgadataflow/insert_tlastmarker.py | 4 ++-- .../fpgadataflow/make_pynq_driver.py | 4 +--- .../fpgadataflow/make_zynq_proj.py | 4 +--- .../fpgadataflow/set_fifo_depths.py | 10 ++++---- .../fpgadataflow/set_folding.py | 2 +- .../fpgadataflow/specialize_layers.py | 2 +- src/finn/transformation/move_reshape.py | 2 +- src/finn/util/create.py | 2 +- tests/end2end/test_end2end_bnn_pynq.py | 24 +++++++++---------- tests/end2end/test_end2end_cybsec_mlp.py | 4 ++-- tests/end2end/test_end2end_mobilenet_v1.py | 2 +- tests/fpgadataflow/test_code_gen_trafo.py | 2 +- tests/fpgadataflow/test_compilation_trafo.py | 2 +- .../test_convert_to_hw_1d_conv_layer.py | 2 +- .../test_convert_to_hw_conv_layer.py | 2 +- .../test_convert_to_hw_layers_cnv.py | 4 ++-- .../test_convert_to_hw_layers_fc.py | 16 ++++++------- .../test_fpgadataflow_checksum.py | 4 ++-- ...dataflow_convinputgenerator_rtl_dynamic.py | 2 +- .../fpgadataflow/test_fpgadataflow_deconv.py | 10 ++++---- .../test_fpgadataflow_ipstitch.py | 8 +++---- tests/fpgadataflow/test_fpgadataflow_mvau.py | 12 +++++----- .../test_fpgadataflow_res_estimate.py | 8 ++++--- tests/fpgadataflow/test_minimize_bit_width.py | 16 ++++++------- tests/fpgadataflow/test_set_folding.py | 2 +- tests/fpgadataflow/test_split_large_fifos.py | 6 ++--- 39 files changed, 116 insertions(+), 124 deletions(-) diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index 000e1208d7..c2d0cf7048 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -28,7 +28,7 @@ import qonnx.custom_op.registry as registry -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def res_estimation(model): @@ -41,7 +41,7 @@ def res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) res_dict[node.name] = inst.node_res_estimation() @@ -59,12 +59,10 @@ def res_estimation_complete(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) op_type = node.op_type - if op_type.startswith("MatrixVectorActivation") or op_type.startswith( - "VectorVectorActivation" - ): + if op_type.startswith("MVAU") or op_type.startswith("VectorVectorActivation"): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index d4c9904fe1..6154bdc924 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -40,7 +40,7 @@ from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.lookup import Lookup -from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.pool import Pool from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, @@ -59,7 +59,7 @@ # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure -custom_op["MatrixVectorActivation"] = MatrixVectorActivation +custom_op["MVAU"] = MVAU custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Thresholding"] = Thresholding custom_op["VectorVectorActivation"] = VectorVectorActivation diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 1e2c83ba39..6e465fd0f2 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -41,9 +41,7 @@ from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls -from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import ( - MatrixVectorActivation_hls, -) +from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( StreamingDataWidthConverter_hls, @@ -81,5 +79,5 @@ custom_op["Thresholding_hls"] = Thresholding_hls custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls -custom_op["MatrixVectorActivation_hls"] = MatrixVectorActivation_hls +custom_op["MVAU_hls"] = MVAU_hls custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index f40e6d78e8..c6ca66e15d 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -33,7 +33,7 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend -from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy # ONNX i/o tensor shape assumptions for MatrixVectorActivation: @@ -44,7 +44,7 @@ # the ... here can be any shape (representing groups of vectors) -class MatrixVectorActivation_hls(MatrixVectorActivation, HLSBackend): +class MVAU_hls(MVAU, HLSBackend): """Corresponds to finn-hlslib MatrixVectorActivation_Batch function.""" def __init__(self, onnx_node, **kwargs): @@ -52,7 +52,7 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = {} - my_attrs.update(MatrixVectorActivation.get_nodeattr_types(self)) + my_attrs.update(MVAU.get_nodeattr_types(self)) my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 1b6be752dc..ac173e4af6 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -51,7 +51,7 @@ # the ... here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HWCustomOp): +class MVAU(HWCustomOp): """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): @@ -122,9 +122,6 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def base_op_type(self): - return "MatrixVectorActivation" - def execute_node(self, context, graph): node = self.onnx_node in_act = context[node.input[0]] diff --git a/src/finn/qnn-data/build_dataflow/folding_config.json b/src/finn/qnn-data/build_dataflow/folding_config.json index 95167f1a30..46f1d6236d 100644 --- a/src/finn/qnn-data/build_dataflow/folding_config.json +++ b/src/finn/qnn-data/build_dataflow/folding_config.json @@ -1,30 +1,30 @@ { "Defaults": {}, - "Thresholding_Batch_0": { + "Thresholding_hls_0": { "PE": 49, "ram_style": "distributed" }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_3": { + "MVA_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } } diff --git a/src/finn/qnn-data/build_dataflow/specialize_layers_config.json b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json index 4fc37896db..c2a8bd4553 100644 --- a/src/finn/qnn-data/build_dataflow/specialize_layers_config.json +++ b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json @@ -3,25 +3,25 @@ "Thresholding_0": { "preferred_impl_style": "hls" }, - "MatrixVectorActivation_0": { + "MVAU_0": { "preferred_impl_style": "hls" }, "Thresholding_1": { "preferred_impl_style": "" }, - "MatrixVectorActivation_1": { + "MVAU_1": { "preferred_impl_style": "" }, "Thresholding_2": { "preferred_impl_style": "" }, - "MatrixVectorActivation_2": { + "MVAU_2": { "preferred_impl_style": "" }, "Thresholding_3": { "preferred_impl_style": "rtl" }, - "MatrixVectorActivation_3": { + "MVAU_3": { "preferred_impl_style": "" }, "LabelSelect_0": { diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json index 442ea72d9a..498d329ba3 100644 --- a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json +++ b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json @@ -1,30 +1,30 @@ { "Defaults": {}, - "Thresholding_Batch_0": { + "Thresholding_hls_0": { "PE": 49, "ram_style": "distributed" }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } } diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index ade76afdde..014a5c82bd 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1356,7 +1356,7 @@ def apply(self, model): model.set_tensor_shape(mt_output, mt_out_shape) # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", [mm_input, mm_weight, mt_thres], [mt_output], domain="finn.custom_op.fpgadataflow", @@ -1387,7 +1387,7 @@ def apply(self, model): model.set_tensor_shape(mm_output, mm_out_shape) # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", [mm_input, mm_weight], [mm_output], domain="finn.custom_op.fpgadataflow", @@ -1493,7 +1493,7 @@ def apply(self, model): actval = 0 # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", [mm_input, mm_weight, mt_thres], [mt_output], domain="finn.custom_op.fpgadataflow", @@ -1510,7 +1510,7 @@ def apply(self, model): noActivation=0, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, + name="MVAU_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -1524,7 +1524,7 @@ def apply(self, model): model.set_tensor_shape(mm_output, mm_out_shape) # create and insert new MatrixVectorActivation node new_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", [mm_input, mm_weight], [mm_output], domain="finn.custom_op.fpgadataflow", @@ -1541,7 +1541,7 @@ def apply(self, model): noActivation=1, numInputVectors=list(mm_in_shape[:-1]), mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, + name="MVAU_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old node diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index a8ecdcf484..4212e2b58a 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -55,7 +55,7 @@ def is_external_input(model, node, i): if model.get_initializer(node.input[i]) is None: return True else: - if op_type.startswith("MatrixVectorActivation"): + if op_type.startswith("MVAU"): if node_inst.get_nodeattr("mem_mode") == "external": return True return False diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 5231fc288b..b24145afcb 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -150,7 +150,7 @@ def apply(self, model): continue elif not ( - node.op_type.startswith("MatrixVectorActivation") + node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index e3a52f68f0..100beefcc2 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -88,7 +88,7 @@ def apply(self, model): # - if FC and external mem, it could be connected to input 1 # - if concat, could be connected to any input if ( - consumer.op_type.startswith("MatrixVectorActivation") + consumer.op_type.startswith("MVAU") and n1.get_nodeattr("mem_mode") == "external" ) or (consumer.op_type.startswith("StreamingConcat")): # get input idx diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index f3334d94f5..96f23ca320 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -199,8 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type - in ["MatrixVectorActivation_hls", "VectorVectorActivation_hls"] + lambda x: x.op_type in ["MVAU_hls", "VectorVectorActivation_hls"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 431ca8e0b5..2131100dcf 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -103,7 +103,7 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - first_node.op_type.startswith("MatrixVectorActivation") + first_node.op_type.startswith("MVAU") and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): @@ -117,7 +117,7 @@ def apply(self, model): num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if first_node.op_type.startswith("MatrixVectorActivation") and inp_idx == 1: + if first_node.op_type.startswith("MVAU") and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) elif first_node.op_type.startswith("AddStreams") and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index 9a5317e588..ea9bd2aa26 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -282,9 +282,7 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) rt_layer_ind = 0 for node in dataflow_model.graph.node: - if node.op_type.startswith("MatrixVectorActivation") or node.op_type.startswith( - "Thresholding" - ): + if node.op_type.startswith("MVAU") or node.op_type.startswith("Thresholding"): node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 65095f1de7..7e3754e41e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -64,9 +64,7 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if node.op_type.startswith("MatrixVectorActivation") or node.op_type.startswith( - "Thresholding" - ): + if node.op_type.startswith("MVAU") or node.op_type.startswith("Thresholding"): if node_inst.get_nodeattr("mem_mode") == "decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 5d3b42b0c0..1e25670a71 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -176,7 +176,7 @@ def apply(self, model): continue if fifo_cons is None: continue - if not fifo_cons.op_type.startswith("MatrixVectorActivation"): + if not fifo_cons.op_type.startswith("MVAU"): continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -259,7 +259,7 @@ def __init__( def apply(self, model): # these optypes may potentially use external weights # we'll temporarily change them to use decoupled mode for FIFO sizing - extw_optypes = ["MatrixVectorActivation_hls", "VectorVectorActivation_hls"] + extw_optypes = ["MVAU_hls", "VectorVectorActivation_hls"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] @@ -568,7 +568,7 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type.startswith("StreamingFIFO"): + if node.op_type == ("StreamingFIFO_rtl"): n_inst = getCustomOp(node) depth = n_inst.get_nodeattr("depth") cfgs = get_fifo_split_configs(depth, self.max_qsrl_depth, self.max_vivado_depth) @@ -593,10 +593,10 @@ def apply(self, model): graph.value_info.append(out_tensor) model.set_tensor_datatype(out_tensor.name, DataType[dtype]) fifo_node = helper.make_node( - "StreamingFIFO", + "StreamingFIFO_rtl", [inp], [outp], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.rtl", backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 83f4138668..28358fdacc 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -124,7 +124,7 @@ def apply(self, model): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "MatrixVectorActivation_hls": + if op_type == "MVAU_hls": max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 691d7aed34..6c94f45d16 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -35,7 +35,7 @@ from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants restricted_layers = [] -restricted_layers.append("MatrixVectorActivation") +restricted_layers.append("MVAU") restricted_layers.append("VectorVectorActivation") restricted_layers.append("Thresholding") diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index 6b5fa5516f..a13ecee80f 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -50,7 +50,7 @@ def apply(self, model): producer = model.find_producer(transp_node.input[0]) if _is_fpgadataflow_node(producer) is True: consumer = model.find_consumer(n.output[0]) - if consumer.op_type.startswith("MatrixVectorActivation"): + if consumer.op_type.startswith("MVAU"): fc_inst = getCustomOp(consumer) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/src/finn/util/create.py b/src/finn/util/create.py index af92d1cb8e..09ec4f334c 100644 --- a/src/finn/util/create.py +++ b/src/finn/util/create.py @@ -143,7 +143,7 @@ def hls_mlp_maker(layer_spec): actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, [current_out_name], domain="finn.custom_op.fpgadataflow", diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index b718e62fdf..bdede35244 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -122,7 +122,7 @@ def get_checkpoint_name(topology, wbits, abits, step): def fold_tfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # (PE, SIMD, ramstyle) for each layer config = [(16, 49, "block"), (8, 8, "auto"), (8, 8, "auto"), (10, 8, "distributed")] for fcl, (pe, simd, ramstyle) in zip(fc_layers, config): @@ -140,7 +140,7 @@ def fold_tfc(model): def fold_lfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # (PE, SIMD, ramstyle) for each layer config = [ (32, 49, "block"), @@ -162,7 +162,7 @@ def fold_lfc(model): def fold_cnv_large(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ (16, 3), @@ -189,7 +189,7 @@ def fold_cnv_large(model): def fold_cnv_small(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ (8, 3, "distributed"), @@ -560,26 +560,26 @@ def test_convert_to_hw_layers(self, topology, wbits, abits, board): "tfc": [ ("Reshape", 1), ("Thresholding", 1), - ("MatrixVectorActivation", 4), + ("MVAU", 4), ("LabelSelect", 1), ], "tfc-1-1": [ ("Reshape", 1), ("Thresholding", 4), - ("MatrixVectorActivation", 4), + ("MVAU", 4), ("LabelSelect", 1), ], "lfc": [ ("Reshape", 1), ("Thresholding", 1), - ("MatrixVectorActivation", 4), + ("MVAU", 4), ("LabelSelect", 1), ], "cnv": [ ("Transpose", 1), ("Thresholding", 1), ("ConvolutionInputGenerator", 6), - ("MatrixVectorActivation", 9), + ("MVAU", 9), ("StreamingMaxPool", 2), ("LabelSelect", 1), ], @@ -607,26 +607,26 @@ def test_specialize_layers(self, topology, wbits, abits, board): "tfc": [ ("Reshape", 1), ("Thresholding_hls", 1), - ("MatrixVectorActivation_hls", 4), + ("MVAU_hls", 4), ("LabelSelect_hls", 1), ], "tfc-1-1": [ ("Reshape", 1), ("Thresholding_hls", 4), - ("MatrixVectorActivation_hls", 4), + ("MVAU_hls", 4), ("LabelSelect_hls", 1), ], "lfc": [ ("Reshape", 1), ("Thresholding_hls", 1), - ("MatrixVectorActivation_hls", 4), + ("MVAU_hls", 4), ("LabelSelect_hls", 1), ], "cnv": [ ("Transpose", 1), ("Thresholding_hls", 1), ("ConvolutionInputGenerator_hls", 6), - ("MatrixVectorActivation_hls", 9), + ("MVAU_hls", 9), ("StreamingMaxPool_hls", 2), ("LabelSelect_hls", 1), ], diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index b58b9f472c..9ee07d57a3 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -182,8 +182,8 @@ def test_end2end_cybsec_mlp_build(): # examine the report contents with open(est_cycles_report, "r") as f: est_cycles_dict = json.load(f) - assert est_cycles_dict["MatrixVectorActivation_hls_0"] == 80 - assert est_cycles_dict["MatrixVectorActivation_hls_1"] == 64 + assert est_cycles_dict["MVAU_hls_0"] == 80 + assert est_cycles_dict["MVAU_hls_1"] == 64 with open(est_res_report, "r") as f: est_res_dict = json.load(f) assert est_res_dict["total"]["LUT"] == 7899.0 diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index ba52548290..1fceda8141 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -256,7 +256,7 @@ def test_end2end_mobilenet_folding(): assert extra_fold in [1, 2, 4] # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD, ram_style) for a layer folding = [ (32, 3, "block"), diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py index 709333949e..deb9dd43b4 100644 --- a/tests/fpgadataflow/test_code_gen_trafo.py +++ b/tests/fpgadataflow/test_code_gen_trafo.py @@ -51,7 +51,7 @@ def test_code_gen_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation_hls", + "MVAU_hls", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow.hls", diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py index 1b48df3d4a..7022311d4c 100644 --- a/tests/fpgadataflow/test_compilation_trafo.py +++ b/tests/fpgadataflow/test_compilation_trafo.py @@ -52,7 +52,7 @@ def test_compilation_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation_hls", + "MVAU_hls", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow.hls", diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 55f46e321b..5e06cf9904 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -150,7 +150,7 @@ def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_m else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) new_model = new_model.transform(SpecializeLayers()) - fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py index 8cade1bfa1..ddcf386377 100644 --- a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -135,7 +135,7 @@ def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) new_model = new_model.transform(SpecializeLayers()) - fc_node = new_model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index 117a9a5850..64ccebf97a 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -111,7 +111,7 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(SpecializeLayers()) for node in model.graph.node: - if node.op_type == "MatrixVectorActivation_hls": + if node.op_type == "MVAU_hls": inst = getCustomOp(node) inst.set_nodeattr("mem_mode", "decoupled") mw = inst.get_nodeattr("MW") @@ -138,7 +138,7 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): assert len(non_finn_nodes) == 5 exp_non_finn_nodes = ["Transpose", "Transpose", "Reshape", "Mul", "Add"] assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes - fc_nodes = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + fc_nodes = model.get_nodes_by_op_type("MVAU_hls") assert len(fc_nodes) == 9 swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") assert len(swg_nodes) == 6 diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py index 13f6a4393e..d00521f09f 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -84,22 +84,22 @@ def test_convert_to_hw_layers_tfc_w1a1(): model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation_hls" + assert fc0.op_type == "MVAU_hls" assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 1] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation_hls" + assert fc1.op_type == "MVAU_hls" assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 1] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation_hls" + assert fc2.op_type == "MVAU_hls" assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 1] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation_hls" + assert fc3.op_type == "MVAU_hls" assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] @@ -157,22 +157,22 @@ def test_convert_to_hw_layers_tfc_w1a2(): model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation_hls" + assert fc0.op_type == "MVAU_hls" assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 2] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation_hls" + assert fc1.op_type == "MVAU_hls" assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 2] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation_hls" + assert fc2.op_type == "MVAU_hls" assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 2] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation_hls" + assert fc3.op_type == "MVAU_hls" assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] fc0w = getCustomOp(fc0) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 71d4d60c06..c51030764c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -72,7 +72,7 @@ def create_two_fc_model(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU", ["inp", "w0"], ["mid"], domain="finn.custom_op.fpgadataflow", @@ -91,7 +91,7 @@ def create_two_fc_model(): ) fc1 = helper.make_node( - "MatrixVectorActivation", + "MVAU", ["mid", "w1"], ["outp"], domain="finn.custom_op.fpgadataflow", diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index a05dd53e28..766a294977 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -269,7 +269,7 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1) getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16]) getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) - comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation_hls") + comp_nodes = model.get_nodes_by_op_type("MVAU_hls") comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation_hls") for comp_node in comp_nodes: if depthwise: diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index 9c333e6808..ce8e1ce003 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -41,7 +41,10 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hw_layers import InferConvInpGen +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferConvInpGen, + InferQuantizedMatrixVectorActivation, +) from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import ( InferPixelPaddingDeconv, @@ -164,8 +167,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, model = ref_model.transform(InferPixelPaddingDeconv()) model = model.transform(InferConvInpGen(use_rtl_variant=convinpgen_rtl)) - # TODO: uncomment when MV(A)U is in new class hierarchy - # model = model.transform(InferQuantizedMatrixVectorActivation()) + model = model.transform(InferQuantizedMatrixVectorActivation()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) @@ -178,7 +180,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, elif n.op_type == "FMPadding": pad_node = getCustomOp(n) pad_node.set_nodeattr("preferred_impl_style", "hls") - elif n.op_type == "MatrixVectorActivation": + elif n.op_type == "MVAU": mvau_node = getCustomOp(n) mvau_node.set_nodeattr("PE", pe) mvau_node.set_nodeattr("SIMD", simd) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 846f2c1fe0..ab62b2d476 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -79,7 +79,7 @@ def create_one_fc_model(mem_mode="const"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation_hls", + "MVAU_hls", ["inp", "w0"], ["outp"], domain="finn.custom_op.fpgadataflow.hls", @@ -131,7 +131,7 @@ def create_two_fc_model(mem_mode="decoupled"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation_hls", + "MVAU_hls", ["inp", "w0"], ["mid"], domain="finn.custom_op.fpgadataflow.hls", @@ -150,7 +150,7 @@ def create_two_fc_model(mem_mode="decoupled"): ) fc1 = helper.make_node( - "MatrixVectorActivation_hls", + "MVAU_hls", ["mid", "w1"], ["outp"], domain="finn.custom_op.fpgadataflow.hls", @@ -209,7 +209,7 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) - assert model.graph.node[0].op_type == "MatrixVectorActivation_hls" + assert model.graph.node[0].op_type == "MVAU_hls" assert model.graph.node[-1].op_type == "TLastMarker_hls" model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 1853392724..f202b094e8 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -91,7 +91,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -400,9 +400,9 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] + node = model.get_nodes_by_op_type("MVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -505,11 +505,11 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( hls_synt_res_est = model.analysis(hls_synth_res_estimation) if backend == "hls": - assert "MatrixVectorActivation_hls_0" in hls_synt_res_est + assert "MVAU_hls_0" in hls_synt_res_est else: - assert "MatrixVectorActivation_rtl_0" in hls_synt_res_est + assert "MVAU_rtl_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + node = model.get_nodes_by_op_type("MVAU")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index 2ff7dd8b32..1bc2d9d59e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -38,6 +38,7 @@ res_estimation, res_estimation_complete, ) +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def check_two_dict_for_equality(dict1, dict2): @@ -68,7 +69,7 @@ def test_res_estimate(): node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -95,10 +96,11 @@ def test_res_estimate(): model.set_tensor_datatype("outp", odt) model.set_tensor_datatype("weights", wdt) + model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "BRAM_18K": 0, "BRAM_efficiency": 1, "LUT": 317, @@ -115,7 +117,7 @@ def test_res_estimate(): prod_resource_estimation = model.analysis(res_estimation_complete) expect_resource_estimation = { - "MatrixVectorActivation_0": [ + "MVAU_hls_0": [ { "BRAM_18K": 0, "BRAM_efficiency": 1, diff --git a/tests/fpgadataflow/test_minimize_bit_width.py b/tests/fpgadataflow/test_minimize_bit_width.py index 0e704230e7..2b765610ab 100644 --- a/tests/fpgadataflow/test_minimize_bit_width.py +++ b/tests/fpgadataflow/test_minimize_bit_width.py @@ -36,7 +36,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, roundup_to_integer_multiple from typing import Optional, Union -from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, @@ -68,7 +68,7 @@ def make_unit_test_model(wdt: DataType, idt: DataType, tdt: Optional[DataType] = noActivation=0 if tdt is not None else 1, ) layer2 = helper.make_node( - "MatrixVectorActivation", + "MVAU", ["hid", "params1", "thresh1"] if tdt is not None else ["hid", "params1"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -170,7 +170,7 @@ def test_minimize_weight_bit_width(wdt: DataType, rww: bool): # If runtime-writeable weights, specify as a node attribute for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + if isinstance(inst, (MVAU, VectorVectorActivation)): inst.set_nodeattr("runtime_writeable_weights", int(rww)) # Apply the optimization @@ -179,14 +179,14 @@ def test_minimize_weight_bit_width(wdt: DataType, rww: bool): # Iterate through each node to make sure it functioned properly for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + if isinstance(inst, (MVAU, VectorVectorActivation)): cur_wdt = DataType[inst.get_nodeattr("weightDataType")] exp_wdt = def_wdt if rww else wdt assert cur_wdt.bitwidth() == exp_wdt.bitwidth(), "Mismatched data types" def calculate_accumulator_bit_width( - inst: Union[MatrixVectorActivation, VectorVectorActivation], model: ModelWrapper + inst: Union[MVAU, VectorVectorActivation], model: ModelWrapper ) -> Union[DataType, IntType]: """Calculate the accumulator bit width using the closed-form expressions derived in `Quantized Neural Networks for Low-Precision Accumulation @@ -206,7 +206,7 @@ def phi(x: float) -> float: if inst.get_nodeattr("binaryXnorMode"): weights = 2 * weights - 1 # modify the weights based on if the node is a VVAU or MVAU - if isinstance(inst, MatrixVectorActivation): + if isinstance(inst, MVAU): K = inst.get_nodeattr("MW") # matrix_width = num_inputs elif isinstance(inst, VectorVectorActivation): k_h, k_w = inst.get_nodeattr("Kernel") @@ -275,7 +275,7 @@ def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, # If runtime-writeable weights, specify as a node attribute for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + if isinstance(inst, (MVAU, VectorVectorActivation)): inst.set_nodeattr("runtime_writeable_weights", int(rww)) cur_adt = DataType[inst.get_nodeattr("accDataType")] assert cur_adt.bitwidth() == def_adt.bitwidth(), "Default data type is incorrect" @@ -286,7 +286,7 @@ def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, # Iterate through each node to make sure it functioned properly for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MatrixVectorActivation, VectorVectorActivation)): + if isinstance(inst, (MVAU, VectorVectorActivation)): cur_adt = DataType[inst.get_nodeattr("accDataType")] cur_odt = DataType[inst.get_nodeattr("outputDataType")] # Calculating expected accumulator bit width using a closed-form expression diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index 4992bf59f8..19e459c222 100644 --- a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -64,7 +64,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): simd = 1 FCLayer_nodes += [ helper.make_node( - "MatrixVectorActivation_hls", + "MVAU_hls", [tensors[i].name, "weights_" + str(i), "thresh_" + str(i)], [tensors[i + 1].name], domain="finn.custom_op.fpgadataflow.hls", diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py index 653e1e7896..d4901c92ce 100644 --- a/tests/fpgadataflow/test_split_large_fifos.py +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -55,7 +55,7 @@ def get_folding_cfg(depth=65536): cfg = dict() cfg["Defaults"] = dict() for i in range(4): - key = "StreamingFIFO_" + str(i) + key = "StreamingFIFO_rtl_" + str(i) cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"} return cfg @@ -98,7 +98,7 @@ def test_split_large_fifos(depth, force_python_rtlsim): ) model = ModelWrapper(tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx") # exclude final FIFO node (output FIFO, not part of test) - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1] + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl")[:-1] golden_cfg = get_fifo_split_configs(depth, 256, 32768) for i, fifo_node in enumerate(fifo_nodes): inst = getCustomOp(fifo_node) From b99035a3cea8d2470b1b6aa834953d7b39760c14 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 19 Feb 2024 10:51:52 +0000 Subject: [PATCH 138/291] [MVAU/Tests] Change rtlsim function in MVAU execute node --- .../fpgadataflow/hls/matrixvectoractivation_hls.py | 5 ++--- tests/fpgadataflow/test_fpgadataflow_mvau.py | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index c6ca66e15d..e279d3953a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -29,7 +29,6 @@ import math import numpy as np import os -from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend @@ -555,8 +554,8 @@ def execute_node(self, context, graph): sim = self.get_rtlsim() nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - reset_rtlsim(sim) - toggle_clk(sim) + self.reset_rtlsim(sim) + self.toggle_clk(sim) if mem_mode == "external" or mem_mode == "decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index f202b094e8..d10b560191 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -153,7 +153,7 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_hwop(idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -236,7 +236,7 @@ def test_fpgadataflow_fclayer_hwop(idt, wdt, act, nf, sf, mw, mh): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -329,7 +329,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -431,7 +431,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( +def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( mem_mode, idt, wdt, act, nf, sf, mw, mh, backend ): if nf == -1: @@ -538,7 +538,7 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): +def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): if nf == -1: nf = mh if sf == -1: From e29485a0133d0ef271026a9e8505940575b36241 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 19 Feb 2024 14:05:36 +0000 Subject: [PATCH 139/291] [Tests] Change tests to use new op type for MVAU --- tests/fpgadataflow/test_runtime_weights.py | 2 +- tests/transformation/test_infer_data_layouts_cnv.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 0f0d88dd35..32534d4aa5 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -71,7 +71,7 @@ def test_runtime_weights_single_layer(): layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) model = model.transform(SpecializeLayers()) - fcl = model.get_nodes_by_op_type("MatrixVectorActivation_hls")[0] + fcl = model.get_nodes_by_op_type("MVAU_hls")[0] op_inst = getCustomOp(fcl) op_inst.set_nodeattr("mem_mode", "decoupled") op_inst.set_nodeattr("runtime_writeable_weights", 1) diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py index 6b6674d661..fc9d98d24f 100644 --- a/tests/transformation/test_infer_data_layouts_cnv.py +++ b/tests/transformation/test_infer_data_layouts_cnv.py @@ -116,9 +116,9 @@ def test_infer_data_layouts_cnv(): # since the concept of channels changes with lowering... but it is # conceptually close to NHWC since the innermost dim gets multiplied assert model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC - assert model.get_tensor_layout("MatrixVectorActivation_3_out0") == DataLayout.NHWC + assert model.get_tensor_layout("MVAU_3_out0") == DataLayout.NHWC assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC - assert model.get_tensor_layout("MatrixVectorActivation_6_out0") == DataLayout.NC + assert model.get_tensor_layout("MVAU_6_out0") == DataLayout.NC assert model.get_tensor_layout("global_out") == DataLayout.NC os.remove(export_onnx_path_cnv) From 7429ee607c688e45589333b055cc68f6c0d898b3 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Tue, 20 Feb 2024 11:07:02 +0000 Subject: [PATCH 140/291] [CustomOp] Zero Pad threshold weights file between channel folds --- .../fpgadataflow/rtl/thresholding_rtl.py | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 54797e1b94..26cba23620 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -753,6 +753,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): # TODO add flips/reversals as needed here # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) pe = self.get_nodeattr("PE") + ch = self.get_nodeattr("NumChannels") n_thres_steps = self.get_nodeattr("numSteps") decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) @@ -762,21 +763,32 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): ) decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() width_padded = roundup_to_integer_multiple(weights.shape[1], 4) - # # zero pad the row weight_padded = np.zeros((weights.shape[0],width_padded)) weight_padded[:weights.shape[0], :n_thres_steps ] = weights weight_stream = [] - for channel in weight_padded: - for weight in channel: - wdt = self.get_weight_datatype() - bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) - weight_stream.append(pack_innermost_dim_as_hex_string( - [weight], wdt, bw_hexdigit, prefix="" - ).item()) - + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) + padding = np.zeros(width_padded, dtype=np.int32) + + chan_ind = 0 + cf = ch//pe + for fold in range(cf): + for c in range(2**pe.bit_length()): + if (c==0 or c%pe != 0) and c < pe: + for w in weight_padded[chan_ind]: + w_packed = pack_innermost_dim_as_hex_string( + [w], wdt, bw_hexdigit, prefix="" + ).item() + weight_stream.append(w_packed) + chan_ind +=1 + else: + for z in padding: + w_packed = pack_innermost_dim_as_hex_string( + [z], wdt, bw_hexdigit, prefix="" + ).item() + weight_stream.append(w_packed) with open(weight_file_name, "w") as f: for val in weight_stream: f.write(val + "\n") - else: raise Exception("Unknown weight_file_mode") \ No newline at end of file From b8b7bafa992c75e80e1bb333e3b3d8ee45c06312 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 20 Feb 2024 11:45:48 +0000 Subject: [PATCH 141/291] [Tests] Fix MVAU test with large depth decoupled mode --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index d10b560191..216b0f2937 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -427,12 +427,10 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) -# Backend -@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh, backend + mem_mode, idt, wdt, act, nf, sf, mw, mh ): if nf == -1: nf = mh @@ -504,12 +502,9 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - if backend == "hls": - assert "MVAU_hls_0" in hls_synt_res_est - else: - assert "MVAU_rtl_0" in hls_synt_res_est + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MVAU")[0] + node = model.get_nodes_by_op_type("MVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -534,11 +529,9 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("mw", [32]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [32]) -# Backend -@pytest.mark.parametrize("backend", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, backend): +def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: From 8220852b4d82cdd3e1e71bb1e7e223bbbcbdde19 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 20 Feb 2024 13:27:24 +0000 Subject: [PATCH 142/291] [NB] First cleanup over notebooks --- notebooks/advanced/0_custom_analysis_pass.ipynb | 4 ++-- notebooks/advanced/1_custom_transformation_pass.ipynb | 8 ++++---- notebooks/advanced/2_custom_op.ipynb | 2 +- notebooks/basics/0_how_to_work_with_onnx.ipynb | 4 ++-- .../basics/1_brevitas_network_import_via_QONNX.ipynb | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb index f915b11fa0..5ed48ca6d8 100644 --- a/notebooks/advanced/0_custom_analysis_pass.ipynb +++ b/notebooks/advanced/0_custom_analysis_pass.ipynb @@ -153,9 +153,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb index 7e4989c902..91dd925b25 100644 --- a/notebooks/advanced/1_custom_transformation_pass.ipynb +++ b/notebooks/advanced/1_custom_transformation_pass.ipynb @@ -212,7 +212,7 @@ "\n", "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 4 by default, this can be increased or decreased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n", "\n", - "In the following we want to take a closer look at the implementation using the compile transformation as example." + "In the following we want to take a closer look at the implementation using the compile transformation that is used for cpp simulation as example." ] }, { @@ -230,7 +230,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is fpgadataflow node." + "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is an hls node." ] } ], @@ -250,9 +250,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index 636da64dd5..bdd2976412 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -672,7 +672,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb index 35a83ea97b..f1b3dcf68b 100644 --- a/notebooks/basics/0_how_to_work_with_onnx.ipynb +++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb @@ -613,9 +613,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb index f15f716e7f..5c2f10310f 100644 --- a/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb +++ b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb @@ -177,7 +177,7 @@ "source": [ "## 3. Import into FINN and converting QONNX to FINN-ONNX\n", "\n", - "Similarily to the 1a notebook we will first run a cleanup transformation on the exported QONNX model." + "We will first run a cleanup transformation on the exported QONNX model." ] }, { @@ -318,9 +318,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From c5ca1285e8b613f38930eb3b91d7039750f9c9d0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 20 Feb 2024 15:10:19 +0000 Subject: [PATCH 143/291] [NB] Update cybersec notebooks --- .../cybersecurity/1-train-mlp-with-brevitas.ipynb | 2 +- .../cybersecurity/2-import-into-finn-and-verify.ipynb | 2 +- .../cybersecurity/3-build-accelerator-with-finn.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 7644173284..da037050bb 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -769,7 +769,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index a5bc165573..33b64e11c0 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -399,7 +399,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 80f3cd3819..5e8bff3e04 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -659,7 +659,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, From 0928d31039107b57c69b6bf0144be0d2939077bc Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 20 Feb 2024 17:47:24 +0000 Subject: [PATCH 144/291] [test]: added extra tests to RTL-based MVAU --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 139 ++++++++++++++++++- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index d10b560191..85cca66835 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -26,6 +26,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os +import pickle import pytest import numpy as np @@ -35,7 +37,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames, ApplyConfig from qonnx.util.basic import ( calculate_signed_dot_prod_range, gen_finn_dt_tensor, @@ -53,6 +55,9 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -128,12 +133,30 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non return model -def prepare_inputs(input_tensor, idt, wdt): +def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): + matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"]) + graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) + + model = qonnx_make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype( + "ofm", DataType["INT32"] + ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_initializer("weights", W) + # model.set_tensor_layout("ifm", DataLayout.NHWC) + + return model + + +def prepare_inputs(input_tensor, idt, wdt, inp_name="inp"): if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: # convert bipolar to binary - return {"inp": (input_tensor + 1) / 2} + return {inp_name: (input_tensor + 1) / 2} else: - return {"inp": input_tensor} + return {inp_name: input_tensor} # activation: None or DataType @@ -370,6 +393,7 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("rtlsim_trace", "mvau_trace.vcd") # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -396,6 +420,7 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) + model.save("mvau_rtl.onnx") y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" @@ -583,3 +608,109 @@ def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, b assert (chrc_in[0, :sf] == range(1, sf + 1)).all() # all outputs should be produced within the exp n of cycles assert chrc_out[0, exp_total_cycles] == nf + + +# @pytest.mark.parametrize("mh", [36]) +# @pytest.mark.parametrize("mw", [256]) +@pytest.mark.parametrize("mh", [1]) +@pytest.mark.parametrize("mw", [8]) +# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) +# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) +# @pytest.mark.parametrize("pe", [1, 3, 9]) +# @pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) +@pytest.mark.parametrize("pe", [1]) +@pytest.mark.parametrize("simd", [4]) +# @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +# @pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +@pytest.mark.parametrize("idt", [DataType["UINT4"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"]]) +# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) +# @pytest.mark.parametrize("clk_ns", [1.66, 4]) +@pytest.mark.parametrize("clk_ns", [4]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_rtl_mvau( + mh, mw, pe, simd, idt, wdt, part, clk_ns +): + if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") + + build_dir = os.environ["FINN_BUILD_DIR"] + # Create test input vector (produced by SWG) + ofm_shape = (2, 2) + ofm_h, ofm_w = ofm_shape + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) + W = gen_finn_dt_tensor(wdt, (mw, mh)) + model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + model.save(build_dir + "/matmul.onnx") + + # Create MatMul & obtain golden reference output + A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) + input_dict = prepare_inputs(A, idt, wdt, inp_name="global_in") + + # Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/onnx_output.pkl", "wb") as f: + pickle.dump(output_matmul, f) + + # Create MVAU (HLS) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "MVAU_0": { + "PE": pe, + "SIMD": simd, + "mem_mode": "decoupled", + "ram_style": "auto", + "resType": "dsp", + "preferred_impl_style" : "rtl" + }, + } + model = model.transform(ApplyConfig(folding_config)) + model.save(build_dir + "/mvau_hls.onnx") + + # Apply convert-to-rtl step + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model.save(build_dir + "/mvau_rtl.onnx") + + # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated + for n in model.graph.node: + getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") + + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] + + with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: + pickle.dump(output_mvau_rtl, f) + + model.save(build_dir + "/mvau_rtl_sim.onnx") + import pdb; pdb.set_trace() + assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!" + + model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(part, clk_ns)) + + os.environ["RTLSIM_TRACE_DEPTH"] = "3" + model.set_metadata_prop("rtlsim_so", "") + model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") + model.save(build_dir + "/stitched_ip.onnx") + output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] + + assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" \ No newline at end of file From ef8157c06c4d5d771c9aa9d4bd527ec09c898ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20B=2E=20Preu=C3=9Fer?= Date: Wed, 21 Feb 2024 14:36:56 +0000 Subject: [PATCH 145/291] Reply to readbacks from padded memory areas. --- finn-rtllib/thresholding/hdl/thresholding.sv | 26 +++++++++++++++---- .../thresholding/sim/thresholding_tb.sv | 2 +- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index dc612f387f..4c83c8e9db 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -146,16 +146,32 @@ module thresholding #( end // PE Configuration Address Decoding - uwire cfg_sel[PE]; - if(PE == 1) assign cfg_sel[0] = 1; + logic cfg_sel[PE]; + logic cfg_oob; + logic [N-1:0] cfg_ofs; + if(PE == 1) begin + assign cfg_sel[0] = 1; + assign cfg_oob = 0; + assign cfg_ofs = cfg_a[0+:N]; + end else begin - for(genvar pe = 0; pe < PE; pe++) begin - assign cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_a[N+:$clog2(PE)] == pe); + uwire [$clog2(PE)-1:0] cfg_pe = cfg_a[N+:$clog2(PE)]; + always_comb begin + foreach(cfg_sel[pe]) begin + cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_pe == pe); + end + cfg_oob = (cfg_pe >= PE); + cfg_ofs = cfg_a[0+:N]; + if(cfg_oob && !cfg_we) begin + // Map readbacks from padded rows (non-existent PEs) to padded highest threshold index of first PE + cfg_sel[0] = 1; + cfg_ofs = '1; + end end end uwire ptr_t iptr; - assign iptr[0+:N] = cfg_a[0+:N]; + assign iptr[0+:N] = cfg_ofs; if(CF > 1) begin // Channel Fold Rotation logic [$clog2(CF)-1:0] CnlCnt = 0; diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv index e42145f10e..3f4ca61a85 100644 --- a/finn-rtllib/thresholding/sim/thresholding_tb.sv +++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv @@ -196,7 +196,7 @@ module thresholding_tb #( end join_any done <= 1; - repeat((DEEP_PIPELINE+1)*N+6) @(posedge clk); + repeat((DEEP_PIPELINE+1)*N+8) @(posedge clk); assert(QW.size() == 0) else begin $error("[%0d] Missing %0d outputs.", i, QW.size()); From a395fc7d35e7410693edfb5253c91b83dfe054fb Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 21 Feb 2024 18:16:33 +0000 Subject: [PATCH 146/291] [Transform/Analysis] Cleanup usage of is_fpgadataflow_node --- .../fpgadataflow/dataflow_performance.py | 9 ++-- .../fpgadataflow/exp_cycles_per_layer.py | 7 +-- .../analysis/fpgadataflow/floorplan_params.py | 3 +- .../fpgadataflow/hls_synth_res_estimation.py | 4 +- .../analysis/fpgadataflow/post_synth_res.py | 7 +-- .../fpgadataflow/annotate_cycles.py | 7 +-- .../fpgadataflow/annotate_resources.py | 7 +-- .../transformation/fpgadataflow/cleanup.py | 5 +- .../fpgadataflow/compile_cppsim.py | 4 +- .../fpgadataflow/derive_characteristic.py | 6 +-- .../fpgadataflow/hlssynth_ip.py | 6 +-- .../transformation/fpgadataflow/insert_dwc.py | 2 +- .../fpgadataflow/insert_fifo.py | 7 +-- .../fpgadataflow/insert_hook.py | 6 +-- .../minimize_accumulator_width.py | 5 +- .../fpgadataflow/minimize_weight_bit_width.py | 4 +- .../fpgadataflow/prepare_cppsim.py | 4 +- .../transformation/fpgadataflow/prepare_ip.py | 18 +++++-- .../fpgadataflow/prepare_rtlsim.py | 7 +-- .../fpgadataflow/replace_verilog_relpaths.py | 7 +-- .../fpgadataflow/set_exec_mode.py | 51 ++++++++++--------- .../fpgadataflow/set_fifo_depths.py | 6 ++- .../fpgadataflow/set_folding.py | 7 +-- src/finn/transformation/move_reshape.py | 23 ++------- 24 files changed, 113 insertions(+), 99 deletions(-) diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py index 824690f5f6..a4bf40760e 100644 --- a/src/finn/analysis/fpgadataflow/dataflow_performance.py +++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ from qonnx.custom_op.registry import getCustomOp -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def dataflow_performance(model): @@ -38,7 +39,7 @@ def dataflow_performance(model): for each node along the critical path. Preconditions: - - model consists of fpgadataflow nodes + - model consists of HLS/RTL nodes - model has cycle estimates annotated (see AnnotateCycles transformation) - nodes have unique names (see GiveUniqueNodeNames) @@ -52,7 +53,7 @@ def dataflow_performance(model): max_node_name = "" for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = getCustomOp(node) node_cycles = int(inst.get_nodeattr("cycles_estimate")) if node_cycles > max_cycles: diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py index e1517ec636..50585720fe 100644 --- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py +++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ import qonnx.custom_op.registry as registry -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def exp_cycles_per_layer(model): @@ -41,7 +42,7 @@ def exp_cycles_per_layer(model): cycle_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) cycle_dict[node.name] = int(inst.get_exp_cycles()) diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py index d57b660bce..be03966fb9 100644 --- a/src/finn/analysis/fpgadataflow/floorplan_params.py +++ b/src/finn/analysis/fpgadataflow/floorplan_params.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ def floorplan_params(model): } } for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): node_inst = getCustomOp(node) node_slr = node_inst.get_nodeattr("slr") node_pid = node_inst.get_nodeattr("partition_id") diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py index cd6b322727..330494315a 100644 --- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py +++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py @@ -30,7 +30,7 @@ import warnings import xml.etree.ElementTree as ET -from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node +from finn.util.fpgadataflow import is_hls_node def hls_synth_res_estimation(model): @@ -44,7 +44,7 @@ def hls_synth_res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) and is_hls_node(node): + if is_hls_node(node): # init values to zero res_dict[node.name] = dict() res_dict[node.name]["BRAM_18K"] = 0 diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 3304b88d60..7b65b60fa7 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def post_synth_res(model, override_synth_report_filename=None): @@ -102,7 +103,7 @@ def get_instance_stats(inst_name): sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model")) sdp_res_dict = post_synth_res(sdp_model, synth_report_filename) res_dict.update(sdp_res_dict) - elif _is_fpgadataflow_node(node): + elif is_hls_node(node) or is_rtl_node(node): node_dict = get_instance_stats(node.name) if node_dict is not None: res_dict[node.name] = node_dict diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py index 7befad7aa7..6646434bdf 100644 --- a/src/finn/transformation/fpgadataflow/annotate_cycles.py +++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class AnnotateCycles(Transformation): @@ -46,7 +47,7 @@ def apply(self, model): graph = model.graph # annotate node cycles for node in graph.node: - if _is_fpgadataflow_node(node): + if is_hls_node(node) or is_rtl_node(node): op_inst = registry.getCustomOp(node) cycles = op_inst.get_exp_cycles() op_inst.set_nodeattr("cycles_estimate", cycles) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index bb5637f7d3..f07a5186d5 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node class AnnotateResources(Transformation): @@ -68,7 +69,7 @@ def apply(self, model): children_dict = {} # annotate node resources for node in graph.node: - if _is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): + if is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): op_inst = registry.getCustomOp(node) op_inst.set_nodeattr("res_" + self.mode, str(self.res_dict[node.name])) children_dict[node.name] = self.res_dict[node.name] diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py index 398580c48e..907b65eb9d 100644 --- a/src/finn/transformation/fpgadataflow/cleanup.py +++ b/src/finn/transformation/fpgadataflow/cleanup.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -53,7 +54,7 @@ def apply(self, model): model.set_metadata_prop("vivado_stitch_proj", "") for node in model.graph.node: op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py index 4814b24a92..6190560265 100644 --- a/src/finn/transformation/fpgadataflow/compile_cppsim.py +++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py @@ -30,7 +30,7 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node +from finn.util.fpgadataflow import is_hls_node class CompileCppSim(NodeLocalTransformation): @@ -51,7 +51,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) and is_hls_node(node): + if is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py index dee9b62e67..4d3ac7dc67 100644 --- a/src/finn/transformation/fpgadataflow/derive_characteristic.py +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -33,7 +33,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class DeriveCharacteristic(NodeLocalTransformation): @@ -59,7 +59,7 @@ def __init__(self, period, num_workers=None, manual_bypass=False): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -131,7 +131,7 @@ def __init__(self, num_workers=None, io_fifo_depth=32): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps prod = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index daf64656b5..5b901d9284 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -32,11 +32,11 @@ import warnings from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node +from finn.util.fpgadataflow import is_hls_node class HLSSynthIP(NodeLocalTransformation): - """For each node: generate IP block from code in folder + """For each HLS node: generate IP block from code in folder that is referenced in node attribute "code_gen_dir_ipgen" and save path of generated project in node attribute "ipgen_path". All nodes in the graph must have the fpgadataflow backend attribute. @@ -55,7 +55,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) and is_hls_node(node): + if is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 100beefcc2..96c114498c 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -43,7 +43,7 @@ def _is_dwc_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): if _is_dwc_node(node): # no DWC for DWCs return False diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 630310842c..9df193efcf 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,8 +46,8 @@ def _is_fifo_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: - if _is_fifo_node(node) is False: + if is_fpgadataflow_node(node): + if not _is_fifo_node(node): return True else: return False diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py index 23b60d6812..843a32a73e 100644 --- a/src/finn/transformation/fpgadataflow/insert_hook.py +++ b/src/finn/transformation/fpgadataflow/insert_hook.py @@ -34,7 +34,7 @@ from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def _is_hook_node(node): @@ -46,8 +46,8 @@ def _is_hook_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: - if _is_hook_node(node) is False: + if is_hls_node(node) or is_rtl_node(node): + if not _is_hook_node(node): return True else: return False diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py index 8d04d5b817..61159fde0c 100644 --- a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py +++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,7 +47,7 @@ def apply(self, model): # Since InferDataTypes potentially changes node attributes in each loop iterations, # the for-loop cannot loop over a list of a snapshot of the graph's node protos node = model.graph.node[node_id] - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): inst = getCustomOp(node) if hasattr(inst, "minimize_accumulator_width"): inst.minimize_accumulator_width(model) diff --git a/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py index 32871cc44a..49770f7d0c 100644 --- a/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py +++ b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py @@ -1,4 +1,4 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -42,7 +42,7 @@ def __init__(self): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): inst = getCustomOp(node) if hasattr(inst, "minimize_weight_bit_width"): inst.minimize_weight_bit_width(model) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 0b744b5f4f..d4cc6dcc99 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -35,7 +35,7 @@ from qonnx.util.basic import get_num_default_workers from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node, is_hls_node +from finn.util.fpgadataflow import is_hls_node def _codegen_single_node(node, model): @@ -79,7 +79,7 @@ def __init__(self, num_workers=None): self._num_workers = mp.cpu_count() def prepareCppSim_node(self, node): - if is_fpgadataflow_node(node) and is_hls_node(node): + if is_hls_node(node): _codegen_single_node(node, self.model) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py index 5461bbd77c..a74e0f7afc 100644 --- a/src/finn/transformation/fpgadataflow/prepare_ip.py +++ b/src/finn/transformation/fpgadataflow/prepare_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from qonnx.transformation.base import Transformation from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def _codegen_single_node(node, model, fpgapart, clk): @@ -72,8 +73,15 @@ class PrepareIP(Transformation): will be skipped. Outcome if succesful: Node attribute "code_gen_dir_ipgen" contains path to folder - that contains generated C++ code that can be used to generate a Vivado IP block. - The subsequent transformation is HLSSynthIP""" + that contains: + + * For HLS layers: generated C++ code that can be used to generate a Vivado IP block. + The necessary subsequent transformation is HLSSynthIP. + + * For RTL layers: filled template verilog files that can be used to instantiate as + module during IP stitching. + + """ def __init__(self, fpgapart, clk): super().__init__() @@ -82,6 +90,6 @@ def __init__(self, fpgapart, clk): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): _codegen_single_node(node, model, self.fpgapart, self.clk) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py index 8ba7cfd965..b8f45deb1d 100644 --- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node try: from pyverilator import PyVerilator @@ -63,7 +64,7 @@ def apply(self, model): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py index 4e7970caa0..de13166e73 100644 --- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py +++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,7 +31,7 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import Transformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class ReplaceVerilogRelPaths(Transformation): @@ -41,7 +42,7 @@ def __init__(self): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py index 7df4451a22..1b5a510d2f 100644 --- a/src/finn/transformation/fpgadataflow/set_exec_mode.py +++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py @@ -30,39 +30,44 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import Transformation -from finn.util.fpgadataflow import is_fpgadataflow_node, is_rtl_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class SetExecMode(Transformation): """Set attribute exec_mode in all fpgadataflow nodes to specify which kind of execution should be used ("cppsim" or "rtlsim"). Note that RTL components do not support cppsim. - If cppsim is selected, only HLS components will be set for cppsim, - RTL components default in this case to rtlsim execution mode.""" + For now, only a model consisting of 100% of HLS layers can be executed + using cppsim.""" def __init__(self, mode): super().__init__() self.mode = mode def apply(self, model): - for node in model.graph.node: - op_type = node.op_type - if is_fpgadataflow_node(node): - if self.mode == "cppsim" and is_rtl_node(node): - mode = "rtlsim" - else: - mode = self.mode - try: - # lookup op_type in registry of CustomOps - inst = registry.getCustomOp(node) - # set sim_mode accordingly to argument mode - inst.set_nodeattr("exec_mode", mode) - # ensure that sim_mode is now set - assert ( - inst.get_nodeattr("exec_mode") != "" - ), """Transformation - was not successful. Node attribute "exec_mode" is not set""" - except KeyError: - # exception if op_type is not supported - raise Exception("Custom op_type %s is currently not supported." % op_type) + mode = self.mode + # if "cppsim" selected, check if model does not contain RTL layers + if mode == "cppsim" and any(is_rtl_node(node) for node in model.graph.node): + raise Exception( + """Model contains RTL layers, + cppsim can only be used on models consisting of HLS layers + and non fpgadataflow nodes.""" + ) + else: + for node in model.graph.node: + op_type = node.op_type + if is_hls_node(node) or is_rtl_node(node): + try: + # lookup op_type in registry of CustomOps + inst = registry.getCustomOp(node) + # set sim_mode accordingly to argument mode + inst.set_nodeattr("exec_mode", mode) + # ensure that sim_mode is now set + assert ( + inst.get_nodeattr("exec_mode") != "" + ), """Transformation + was not successful. Node attribute "exec_mode" is not set""" + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." % op_type) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 1e25670a71..d81f1fe247 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -49,7 +49,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim @@ -265,7 +265,9 @@ def apply(self, model): modified_fc_nodes = [] for node in model.graph.node: # verify assumptions - assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str(node) + assert is_hls_node(node) or is_rtl_node(node), "Found non-fpgadataflow node: " + str( + node + ) op_type = node.op_type assert not op_type.startswith("StreamingFIFO"), "Found existing StreamingFIFO node" node = getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 28358fdacc..bff64d3885 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def divisors(num): @@ -120,7 +121,7 @@ def apply(self, model): # as explained in the SetFolding docstring depthwise_op_exceptions = ["VectorVectorActivation_hls", "Pool_hls"] for node in graph.node: - if not is_fpgadataflow_node(node): + if not (is_hls_node(node) or is_rtl_node(node)): continue op_type = node.op_type node_inst = getCustomOp(node) diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index a13ecee80f..2e6639c5c6 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -1,22 +1,9 @@ import warnings from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from qonnx.util.basic import get_by_name, is_finn_op +from qonnx.util.basic import get_by_name - -def _is_fpgadataflow_node(node): - if node is not None: - if is_finn_op(node.domain): - n_backend = get_by_name(node.attribute, "backend") - if n_backend is None: - return False - backend_value = n_backend.s.decode("UTF-8") - if backend_value == "fpgadataflow": - return True - else: - return False - else: - return False +from finn.util.fpgadataflow import is_fpgadataflow_node class RemoveCNVtoFCFlatten(Transformation): @@ -34,10 +21,10 @@ def apply(self, model): oshape = model.get_tensor_shape(n.output[0]) if len(oshape) == 2 and ishape[0] == oshape[0]: producer = model.find_producer(n.input[0]) - if _is_fpgadataflow_node(producer) is True: + if is_fpgadataflow_node(producer): # standalone flatten, remove consumer = model.find_consumer(n.output[0]) - if _is_fpgadataflow_node(consumer) is True: + if is_fpgadataflow_node(consumer): graph_modified = True consumer.input[0] = n.input[0] graph.node.remove(n) @@ -48,7 +35,7 @@ def apply(self, model): perms = list(get_by_name(transp_node.attribute, "perm").ints) if perms == [0, 3, 1, 2]: producer = model.find_producer(transp_node.input[0]) - if _is_fpgadataflow_node(producer) is True: + if is_fpgadataflow_node(producer): consumer = model.find_consumer(n.output[0]) if consumer.op_type.startswith("MVAU"): fc_inst = getCustomOp(consumer) From 34716ba5fd02704bcf1a6bb3e0b4ad93f3d508c0 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 22 Feb 2024 10:37:31 +0000 Subject: [PATCH 147/291] [tests] only check hls model analysis on hls modules --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index f1be5f89a7..e1e91038ef 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -227,8 +227,9 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert model.graph.node[0].name in hls_synt_res_est + if impl_style == "hls": + hls_synt_res_est = model.analysis(hls_synth_res_estimation) + assert model.graph.node[0].name in hls_synt_res_est node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") From 2bf40ca4f1da4ad5cd7a068e0d739b7ba691bf64 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 22 Feb 2024 10:39:00 +0000 Subject: [PATCH 148/291] [tests] increase folding config for threshold tests --- .../test_fpgadataflow_thresholding.py | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index e1e91038ef..49061e4c9e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -239,11 +239,10 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ assert exp_cycles != 0 @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -@pytest.mark.parametrize("cf", [2]) -@pytest.mark.parametrize("ch", [6]) +@pytest.mark.parametrize("cfg", [(1,1), (6,2), (6,3), (8,2), (8,4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_read(impl_style,cf,ch): +def test_runtime_thresholds_read(impl_style,cfg): """ Read back threshold weights during runtime 1. Create random initial weights T @@ -251,13 +250,12 @@ def test_runtime_thresholds_read(impl_style,cf,ch): 3. Read back weights via AXI 4. Compare with initial weights T """ + ch = cfg[0] + pe = cfg[1] n_inp_vecs = [1, 2, 2] mem_mode = "decoupled" act = DataType["INT4"] idt = DataType["INT16"] - pe = ch // cf - assert ch % pe == 0 - odt = act n_steps = act.get_num_possible_values() - 1 np.random.seed(2) @@ -330,12 +328,11 @@ def read_weights(sim): # Validate the output is as expected assert (y == expected).all() -@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -@pytest.mark.parametrize("cf", [8]) -@pytest.mark.parametrize("ch", [16]) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +@pytest.mark.parametrize("cfg", [(1,1), (6,2), (6,3), (8,2), (8,4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_write(impl_style,cf,ch): +def test_runtime_thresholds_write(impl_style,cfg): """ Write threshold weights during runtime 1. Create random initial weights T_init @@ -346,12 +343,13 @@ def test_runtime_thresholds_write(impl_style,cf,ch): 6. Compare T_write and T_read 7. Validate outputs with expected vectors """ + ch = cfg[0] + pe = cfg[1] + n_inp_vecs = [1, 2, 2] mem_mode = "decoupled" act = DataType["INT4"] idt = DataType["INT16"] - pe = ch // cf - assert ch % pe == 0 odt = act n_steps = act.get_num_possible_values() - 1 From c09005b37529c78489fec057a9480297151ca873 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 22 Feb 2024 10:39:34 +0000 Subject: [PATCH 149/291] [tests] rename threshold weight files for distributed testing --- .../test_fpgadataflow_thresholding.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 49061e4c9e..dfd14268e5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -276,10 +276,12 @@ def test_runtime_thresholds_read(impl_style,cfg): op_inst = getCustomOp(model.graph.node[0]) op_inst.set_nodeattr("runtime_writeable_weights", 1) - op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") - with open("old_weights.dat", "r") as f: + + dat_fname = f"old_weights_{cfg}.dat" + op_inst.make_weight_file(T, "decoupled_runtime", dat_fname) + with open(dat_fname, "r") as f: old_weight_stream = f.read().strip() - os.remove("old_weights.dat") + os.remove(dat_fname) old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) # need to create stitched IP for runtime weight testing @@ -378,10 +380,11 @@ def test_runtime_thresholds_write(impl_style,cfg): # provide non-decreasing thresholds T_write = np.sort(T_write, axis=1) - op_inst.make_weight_file(T_write, "decoupled_runtime", "T_write.dat") - with open("T_write.dat", "r") as f: + dat_fname = f"T_write_{cfg}.dat" # distinguish fname per paramter for distributed testing + op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname) + with open(dat_fname, "r") as f: T_write_stream = f.read().strip() - os.remove("T_write.dat") + os.remove(dat_fname) T_write_stream = map(lambda x: int(x, 16), T_write_stream.split("\n")) T_write_stream = list(T_write_stream) @@ -402,8 +405,6 @@ def test_runtime_thresholds_write(impl_style,cfg): in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) - # trace_file = "trace_wr_01.vcd" - # model.set_metadata_prop("rtlsim_trace",trace_file) exec_ctx_write = {"inp": in_tensor} def write_weights(sim): addr = 0 From 0f03e37668f9ff0cc0024852f872b8918fee6d6b Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 22 Feb 2024 10:40:56 +0000 Subject: [PATCH 150/291] [CustomOp] threshold stage loop starts from 0 --- src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 26cba23620..04a1815a32 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -773,7 +773,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): chan_ind = 0 cf = ch//pe for fold in range(cf): - for c in range(2**pe.bit_length()): + for c in range(2**(pe-1).bit_length()): if (c==0 or c%pe != 0) and c < pe: for w in weight_padded[chan_ind]: w_packed = pack_innermost_dim_as_hex_string( From c4e57da733bff97d63daa8692fc08a05f2d1ba59 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 22 Feb 2024 11:30:16 +0000 Subject: [PATCH 151/291] [tests] convert to hw test for thresholding layers --- tests/fpgadataflow/test_convert_to_hw_thresholding.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index 685c955f4e..3f0487f9f7 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -35,6 +35,7 @@ from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers @@ -115,7 +116,7 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) @pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) @pytest.mark.parametrize("num_input_channels", [16]) -@pytest.mark.parametrize("impl_style", ["hls"]) # TODO: add rtl later +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_convert_multithreshold_to_hardware( @@ -162,7 +163,10 @@ def test_convert_multithreshold_to_hardware( ) model = model.transform(InferThresholdingLayer()) + + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", impl_style) model = model.transform(SpecializeLayers()) model = model.transform(InferShapes()) - # TODO functional verification assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) From 666356a862e922f913264f351495ce2d5d6f8400 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 23 Feb 2024 11:40:41 +0000 Subject: [PATCH 152/291] [CustomOp] Update copyright headers for thresholding --- finn-rtllib/thresholding/hdl/axilite_if.v | 59 ++++++++++--------- finn-rtllib/thresholding/hdl/thresholding.sv | 2 +- .../thresholding/hdl/thresholding_axi.sv | 2 +- .../hdl/thresholding_template_wrapper.v | 39 ++++++------ finn-rtllib/thresholding/sim/thresh_gen.sv | 30 ++++++++++ finn-rtllib/thresholding/sim/thresholding.tcl | 17 ------ .../thresholding/sim/thresholding_axi_tb.sv | 2 +- .../thresholding/sim/thresholding_tb.sv | 2 +- .../fpgadataflow/rtl/thresholding_rtl.py | 12 +--- src/finn/util/basic.py | 4 +- 10 files changed, 87 insertions(+), 82 deletions(-) delete mode 100644 finn-rtllib/thresholding/sim/thresholding.tcl diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v index bdd4de288e..2aeff770d2 100644 --- a/finn-rtllib/thresholding/hdl/axilite_if.v +++ b/finn-rtllib/thresholding/hdl/axilite_if.v @@ -1,32 +1,33 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ module axi4lite_if #( diff --git a/finn-rtllib/thresholding/hdl/thresholding.sv b/finn-rtllib/thresholding/hdl/thresholding.sv index 4c83c8e9db..2e4d419746 100644 --- a/finn-rtllib/thresholding/hdl/thresholding.sv +++ b/finn-rtllib/thresholding/hdl/thresholding.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 1f235b9486..5c7182b214 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v index ef76a23cbc..f35db156f6 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -1,31 +1,32 @@ -/** - * Copyright (c) 2023, Xilinx +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * - * * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. * - * * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * * Neither the name of FINN nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @author Thomas B. Preußer * @brief Verilog wrapper for IP packaging. diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv index 713723aafa..ae30503f8f 100644 --- a/finn-rtllib/thresholding/sim/thresh_gen.sv +++ b/finn-rtllib/thresholding/sim/thresh_gen.sv @@ -1,3 +1,33 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ module thresh_gen; localparam int unsigned K = 9; localparam int unsigned N = 4; diff --git a/finn-rtllib/thresholding/sim/thresholding.tcl b/finn-rtllib/thresholding/sim/thresholding.tcl deleted file mode 100644 index 82dc59deb1..0000000000 --- a/finn-rtllib/thresholding/sim/thresholding.tcl +++ /dev/null @@ -1,17 +0,0 @@ -create_project -force thresholding thresholding.vivado -part xcvc1902-vsva2197-2MP-e-S -set_property board_part xilinx.com:vck190:part0:2.2 [current_project] - -read_verilog hdl/axilite_if.v -read_verilog -sv { hdl/thresholding.sv hdl/thresholding_axi.sv } - -set simset [current_fileset -simset] -set_property -name xsim.simulate.log_all_signals -value true -objects $simset -set_property -name xsim.simulate.runtime -value all -objects $simset -add_files -fileset $simset { sim/thresholding_tb.sv sim/thresholding_axi_tb.sv } - -foreach top { thresholding_tb thresholding_axi_tb } { - set_property top $top $simset - - launch_simulation - close_sim -} diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv index 918f539d15..429fb7776f 100644 --- a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv index 3f4ca61a85..1564f28f0d 100644 --- a/finn-rtllib/thresholding/sim/thresholding_tb.sv +++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 04a1815a32..02133dff39 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -60,16 +60,6 @@ except ModuleNotFoundError: PyVerilator = None -"""@package Thresholding_rtl -- ONNX i/o tensor shape assumptions for Thresholding: -- input 0 is the input tensor, shape (..., NumChannels) -- input 1 is the threshold tensor, shape (NumChannels, n_thres) -- output 0 is the output tensor, shape (..., NumChannels) - same as input -- the '...' here can be any shape (representing groups of vectors) - -This module creates an RTL IP, HLS is not supported. See 'thresholding_batch' -for a HLS equivalent. -""" class Thresholding_rtl(Thresholding, RTLBackend): """Class that corresponds to finn-rtllib 'thresholding' function.""" diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 0a6c0b39c9..077e45200d 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -11,7 +11,7 @@ # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# * Neither the name of Xilinx nor the names of its +# * Neither the name of FINN nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. # From 89378115f6000439e7fd934fa0455d749420a94d Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 23 Feb 2024 11:49:15 +0000 Subject: [PATCH 153/291] [CustomOp] Move calc_tmem to abstraction layer Signed-off-by: aziz bahri --- src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py | 6 ------ src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py | 6 ------ src/finn/custom_op/fpgadataflow/thresholding.py | 6 ++++++ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 16dee92e8a..07fe4296e3 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -67,12 +67,6 @@ def get_nodeattr_types(self): my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def calc_tmem(self): - """Calculates and returns TMEM.""" - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return mh // pe - def bram_estimation(self): """Calculates BRAM cost if resource set to BRAM""" style = self.get_nodeattr("ram_style") diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 02133dff39..cdca8cc373 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -120,12 +120,6 @@ def get_memory_estimate(self): res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count return res_dict - def calc_tmem(self): - """Calculates and returns TMEM.""" - num_channels = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return num_channels // pe - def infer_node_datatype(self, model): """Used for FINN DataType inference: set the output tensors' datatypes accordingly for this node""" diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 945ec16cf0..73c5ecf997 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -205,3 +205,9 @@ def execute_node(self, context, graph): # signed offset y += act.min() context[node.output[0]] = y + + def calc_tmem(self): + """Calculates and returns TMEM.""" + num_channels = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return num_channels // pe \ No newline at end of file From ce14ea228920c75c3b7820d6e660495c790ef099 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 23 Feb 2024 11:52:36 +0000 Subject: [PATCH 154/291] [RTL layers] Default to parent execute node function for cppsim --- src/finn/custom_op/fpgadataflow/hwcustomop.py | 16 +-- .../rtl/convolutioninputgenerator_rtl.py | 111 ++++++++++-------- .../fpgadataflow/rtl/fmpadding_rtl.py | 87 ++++++++------ .../rtl/streamingdatawidthconverter_rtl.py | 86 +++++++------- .../fpgadataflow/set_exec_mode.py | 47 +++----- .../test_fpgadataflow_convinputgenerator.py | 9 +- tests/fpgadataflow/test_fpgadataflow_dwc.py | 14 +-- .../test_fpgadataflow_fmpadding.py | 2 - 8 files changed, 189 insertions(+), 183 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index bf89bcc0b4..854587afc4 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -170,27 +170,27 @@ def uram_efficiency_estimation(self): def bram_estimation(self): """Function for BRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def uram_estimation(self): """Function for UltraRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def lut_estimation(self): """Function for LUT resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def dsp_estimation(self): """Function for DSP resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def get_exp_cycles(self): """Function for estimation of expected cycles for set folding, - is member function of HLSCustomOp class but has to be filled + is member function of HWCustomOp class but has to be filled by every node""" return 0 @@ -316,14 +316,14 @@ def rtlsim_multi_io(self, sim, io_dict): def generate_params(self, model, path): """Function to generate parameters (i.e. weights and thresholds), - is member function of HLSCustomOp class but has to be filled - by every node.""" + is member function of HWCustomOp class but has to be filled + by every node that needs to generate parameters.""" pass @abstractmethod def get_number_output_values(self): """Function to get the number of expected output values, - is member function of HLSCustomOp class but has to be filled + is member function of HWCustomOp class but has to be filled by every node.""" pass diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 6f4bafd73a..aebbc6c646 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -30,6 +30,7 @@ import numpy as np import os import shutil +import warnings from qonnx.core.datatype import DataType from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim @@ -285,15 +286,68 @@ def uram_estimation(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception("cppsim not possible for RTL SWG, please set exec_mode to rtlsim") + warnings.warn( + """RTL components cannot be executed with cppsim. + By default the execution of the HW abstraction parent will be used.""" + ) + ConvolutionInputGenerator.execute_node(self, context, graph) elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} @@ -302,51 +356,6 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - def prepare_codegen_default(self): """Fills code generation dict for the default implementation style by computing the incremental addressing scheme for the circular buffer.""" diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index b8a1505018..713fd81da6 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -30,6 +30,7 @@ import numpy as np import os import shutil +import warnings from qonnx.util.basic import roundup_to_integer_multiple from finn.custom_op.fpgadataflow.fmpadding import FMPadding @@ -70,15 +71,56 @@ def get_verilog_top_module_intf_names(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception("cppsim not possible for FMPadding_rtl, please set exec_mode to rtlsim") + warnings.warn( + """RTL components cannot be executed with cppsim. + By default the execution of the HW abstraction parent will be used.""" + ) + FMPadding.execute_node(self, context, graph) elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" + else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} @@ -87,39 +129,6 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" - def get_template_values(self, ifm_dims, pads, chans, simd, idt): dimY, dimX = ifm_dims padT, padL, padB, padR = pads diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index 6fcfaa1db0..2b1ff019ac 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -29,6 +29,7 @@ import numpy as np import os import shutil +import warnings from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( @@ -73,18 +74,55 @@ def check_divisible_iowidths(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception( - """cppsim not possible for StreamingDataWidthConverter_rtl, - please set exec_mode to rtlsim""" + warnings.warn( + """RTL components cannot be executed with cppsim. + By default the execution of the HW abstraction parent will be used.""" ) + StreamingDataWidthConverter.execute_node(self, context, graph) elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple( + exp_ishape + ), """Input shape doesn't + match expected shape.""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert context[node.output[0]].shape == tuple( + exp_oshape + ), """Output shape doesn't match expected shape.""" else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} @@ -93,38 +131,6 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == tuple( - exp_ishape - ), """Input shape doesn't - match expected shape.""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - assert context[node.output[0]].shape == tuple( - exp_oshape - ), """Output shape doesn't match expected shape.""" - def get_template_values(self): topname = self.get_verilog_top_module_name() ibits = self.get_instream_width() diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py index 1b5a510d2f..405ddb0c42 100644 --- a/src/finn/transformation/fpgadataflow/set_exec_mode.py +++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py @@ -36,38 +36,29 @@ class SetExecMode(Transformation): """Set attribute exec_mode in all fpgadataflow nodes to specify which kind of execution should be used ("cppsim" or "rtlsim"). - Note that RTL components do not support cppsim. - For now, only a model consisting of 100% of HLS layers can be executed - using cppsim.""" + Note that RTL components do not support cppsim. When cppsim is selected + for RTL components, by default the execution of the HW op parent is + executed.""" def __init__(self, mode): super().__init__() self.mode = mode def apply(self, model): - mode = self.mode - # if "cppsim" selected, check if model does not contain RTL layers - if mode == "cppsim" and any(is_rtl_node(node) for node in model.graph.node): - raise Exception( - """Model contains RTL layers, - cppsim can only be used on models consisting of HLS layers - and non fpgadataflow nodes.""" - ) - else: - for node in model.graph.node: - op_type = node.op_type - if is_hls_node(node) or is_rtl_node(node): - try: - # lookup op_type in registry of CustomOps - inst = registry.getCustomOp(node) - # set sim_mode accordingly to argument mode - inst.set_nodeattr("exec_mode", mode) - # ensure that sim_mode is now set - assert ( - inst.get_nodeattr("exec_mode") != "" - ), """Transformation - was not successful. Node attribute "exec_mode" is not set""" - except KeyError: - # exception if op_type is not supported - raise Exception("Custom op_type %s is currently not supported." % op_type) + for node in model.graph.node: + op_type = node.op_type + if is_hls_node(node) or is_rtl_node(node): + try: + # lookup op_type in registry of CustomOps + inst = registry.getCustomOp(node) + # set sim_mode accordingly to argument mode + inst.set_nodeattr("exec_mode", self.mode) + # ensure that sim_mode is now set + assert ( + inst.get_nodeattr("exec_mode") != "" + ), """Transformation + was not successful. Node attribute "exec_mode" is not set""" + except KeyError: + # exception if op_type is not supported + raise Exception("Custom op_type %s is currently not supported." % op_type) return (model, False) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 1a9a934df1..1fe96d6bd7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -202,12 +202,9 @@ def test_fpgadataflow_slidingwindow( inst.set_nodeattr("parallel_window", parallel_window) if exec_mode == "cppsim": - if model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl": - pytest.skip("cppsim not supported for RTL DWC") - else: - model = model.transform(SetExecMode("cppsim")) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) elif exec_mode == "rtlsim": model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index d46815ebac..7152d32a7b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -96,7 +96,7 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc_rtlsim(config, exec_mode): +def test_fpgadataflow_dwc(config, exec_mode): shape, inWidth, outWidth, finn_dtype = config test_fpga_part = "xc7z020clg400-1" @@ -114,16 +114,12 @@ def test_fpgadataflow_dwc_rtlsim(config, exec_mode): assert y.shape == tuple(shape), """The output shape is incorrect.""" model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) if exec_mode == "cppsim": - if model.graph.node[0].op_type == "StreamingDataWidthConverter_rtl": - pytest.skip("cppsim not supported for RTL DWC") - else: - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": - model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) model = model.transform(SetExecMode("rtlsim")) diff --git a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index 12c84e7221..45cc265ac7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -113,8 +113,6 @@ def make_single_fmpadding_modelwrapper(impl_style, idim, padding, num_ch, simd, @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): - if impl_style == "rtl" and mode == "cppsim": - pytest.skip("rtl implstyle has no cppsim, skipping") if num_ch % simd != 0: pytest.skip(" num_ch % simd != 0, skipping") From 243883137ede3a8425a52897b559f2c3558793d6 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 23 Feb 2024 14:39:02 +0000 Subject: [PATCH 155/291] [CustomOps] threshold mem_mode for HLS variant only --- src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py | 4 ++++ src/finn/custom_op/fpgadataflow/thresholding.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 07fe4296e3..7afc42b6e7 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -60,6 +60,10 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { + # memory mode for the thresholds + # const -- embedded thresholds, default + # decoupled -- streaming thresholds with streamer packaged inside IP + "mem_mode": ("s", False, "const", {"const", "decoupled"}), # string defining memory type "ram_style": ("s", False, "distributed", {"distributed", "block"}), } diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 73c5ecf997..a2f4b7e624 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -42,10 +42,6 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", "decoupled"}), # whether weights (thresholds) will be # writable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. From e3e8c974d3adefc630019ad35931a23e318a0705 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 23 Feb 2024 14:44:31 +0000 Subject: [PATCH 156/291] [Transform] Clean up SpecializeLayers transform --- .../transformation/fpgadataflow/specialize_layers.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 6c94f45d16..a8e8fc72c1 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -34,20 +34,10 @@ from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants -restricted_layers = [] -restricted_layers.append("MVAU") -restricted_layers.append("VectorVectorActivation") -restricted_layers.append("Thresholding") - def _determine_impl_style(node): optype = node.op_type - # if rtl variant has specific restrictions - # use always the hls variant for now - if optype in restricted_layers: - return "hls" - # check if there is an HLS or RTL variant or both hls_variant = optype + "_hls" in hls_variants.keys() rtl_variant = optype + "_rtl" in rtl_variants.keys() @@ -77,7 +67,7 @@ def _determine_impl_style(node): # check if user setting can be fulfilled # otherwise change impl_style - if impl_style == "hls": + elif impl_style == "hls": if optype == "ConvolutionInputGenerator": if not _swg_hls_possible(node): warn_str = ( From 55671acc2116ba8fb2e59de9352fc2ca91beab87 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 23 Feb 2024 17:02:08 +0000 Subject: [PATCH 157/291] [Transform] Cleanup InsertDWC check if node is dwc node --- src/finn/transformation/fpgadataflow/insert_dwc.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 96c114498c..33cc3e86d3 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -35,10 +35,7 @@ def _is_dwc_node(node): - if node.op_type.startswith("StreamingDataWidthConverter"): - return True - else: - return False + return node.op_type.startswith("StreamingDataWidthConverter") def _suitable_node(node): From b60dc425578266feb68c7c3ad7fd591189da0b88 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 23 Feb 2024 17:30:37 +0000 Subject: [PATCH 158/291] [RTL layers] Remove warning for cppsim --- .../fpgadataflow/rtl/convolutioninputgenerator_rtl.py | 5 ----- src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py | 5 ----- .../fpgadataflow/rtl/streamingdatawidthconverter_rtl.py | 5 ----- 3 files changed, 15 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index aebbc6c646..08564ca6da 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -30,7 +30,6 @@ import numpy as np import os import shutil -import warnings from qonnx.core.datatype import DataType from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim @@ -289,10 +288,6 @@ def execute_node(self, context, graph): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - warnings.warn( - """RTL components cannot be executed with cppsim. - By default the execution of the HW abstraction parent will be used.""" - ) ConvolutionInputGenerator.execute_node(self, context, graph) elif mode == "rtlsim": node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index 713fd81da6..19765d64c4 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -30,7 +30,6 @@ import numpy as np import os import shutil -import warnings from qonnx.util.basic import roundup_to_integer_multiple from finn.custom_op.fpgadataflow.fmpadding import FMPadding @@ -74,10 +73,6 @@ def execute_node(self, context, graph): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - warnings.warn( - """RTL components cannot be executed with cppsim. - By default the execution of the HW abstraction parent will be used.""" - ) FMPadding.execute_node(self, context, graph) elif mode == "rtlsim": node = self.onnx_node diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index 2b1ff019ac..ef918b5db8 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -29,7 +29,6 @@ import numpy as np import os import shutil -import warnings from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( @@ -77,10 +76,6 @@ def execute_node(self, context, graph): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - warnings.warn( - """RTL components cannot be executed with cppsim. - By default the execution of the HW abstraction parent will be used.""" - ) StreamingDataWidthConverter.execute_node(self, context, graph) elif mode == "rtlsim": node = self.onnx_node From e7c1e5fddb0c178e3c3a474d970061e1783ea3b3 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 28 Feb 2024 11:37:37 +0000 Subject: [PATCH 159/291] [CustomOp] restructure class methods from class hierachy --- .../fpgadataflow/hls/thresholding_hls.py | 6 +- .../fpgadataflow/rtl/thresholding_rtl.py | 140 +----------------- .../custom_op/fpgadataflow/thresholding.py | 59 +++++++- 3 files changed, 58 insertions(+), 147 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 7afc42b6e7..7b9809f495 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -105,11 +105,7 @@ def lut_estimation(self): def get_weightstream_width(self): """Returns weight stream width. Used only in decoupled mode.""" if self.get_nodeattr("mem_mode") == "decoupled": - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - n_thres_steps = self.get_nodeattr("numSteps") - w_width = pe * wp * n_thres_steps - return w_width + return super().get_weightstream_width() else: return 0 diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index cdca8cc373..007c322dea 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -154,128 +154,6 @@ def lut_estimation(self): res_dict = self.get_memory_estimate() return res_dict.get("LUTRAM", 0) - def get_input_datatype(self, ind=0): - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - return DataType[self.get_nodeattr("outputDataType")] - - def get_weight_datatype(self): - """The term 'weights' and 'thresholds' are used interchangably in this class.""" - return DataType[self.get_nodeattr("weightDataType")] - - def minimize_accumulator_width(self, model): - "Minimize threshold width ('accumulator width' here due to convention)" - thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - self.set_nodeattr("weightDataType", tdt.name) - return DataType[self.get_nodeattr("weightDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - - def get_weightstream_width(self): - """Returns weight stream width""" - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - n_thres_steps = self.get_nodeattr("numSteps") - w_width = pe * wp * n_thres_steps - return w_width - - def get_folded_input_shape(self, ind=0): - fold = self.calc_tmem() - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - num_channels = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [num_channels]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for unsigned inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = mh // pe - assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" - if not self.get_input_datatype().signed(): - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (mh, 1)) - assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - def get_all_meminit_filenames(self, abspath=False): "Return a list of all .dat memory initializer files used for this node" dat_files = [] @@ -623,23 +501,7 @@ def code_generation_ipi(self): return cmd def get_verilog_top_module_intf_names(self): - """Return a dict of names of input and output interfaces. - The keys reflect the protocols each interface implements: - 'clk', 'rst', 'm_axis', 's_axis', 'aximm', 'axilite'. - Values are lists of tuples (axis, aximm) or names (axilite): - 'axis' tuples correspond to the list of node inputs in order, - each tuple is (interface_name, interface_width_bits). - axilite always assumed to be 32 bits and is not tuple (name only). - Each block must have at most one aximm and one axilite.""" - - intf_names = {} - intf_names["clk"] = ["ap_clk"] - intf_names["rst"] = ["ap_rst_n"] - intf_names["s_axis"] = [("in0_V", self.get_instream_width_padded())] - intf_names["m_axis"] = [("out_V", self.get_outstream_width_padded())] - intf_names["aximm"] = [] - intf_names["axilite"] = [] - intf_names["ap_none"] = [] + intf_names = super().get_verilog_top_module_intf_names() if self.get_nodeattr("runtime_writeable_weights") == 1: intf_names["axilite"] = ["s_axilite"] diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index a2f4b7e624..d3ba724818 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -30,7 +30,7 @@ import warnings from qonnx.core.datatype import DataType from qonnx.custom_op.general.multithreshold import multithreshold - +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp @@ -122,10 +122,18 @@ def get_weight_datatype(self): """Returns FINN DataType of thresholds, here called weights.""" return DataType[self.get_nodeattr("weightDataType")] + def get_weightstream_width(self): + """Returns weight stream width""" + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + def minimize_accumulator_width(self, model): "Minimize threshold width ('accumulator width' here due to convention)" thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() min_input = self.get_input_datatype().min() @@ -159,7 +167,7 @@ def get_outstream_width(self, ind=0): def get_folded_input_shape(self, ind=0): ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") - fold = ich // pe + fold = self.calc_tmem() vecs = list(self.get_nodeattr("numInputVectors")) folded_input_shape = tuple(vecs + [fold, pe]) return folded_input_shape @@ -186,6 +194,51 @@ def get_exp_cycles(self): # Channels/PE * batch size * fmdim * fmdim return np.prod(self.get_folded_output_shape()[:-1]) + + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." + assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + def execute_node(self, context, graph): node = self.onnx_node inp_values = context[node.input[0]] From d16d493ad99a1758b04df37cb96d6cd7b7074308 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 1 Mar 2024 11:18:28 +0000 Subject: [PATCH 160/291] [CustomOp] Remove redudent methods from thresholding rtl --- .../fpgadataflow/rtl/thresholding_rtl.py | 53 ++++++++----------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 007c322dea..c112125925 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -429,7 +429,28 @@ def execute_node(self, context, graph): "inputs": {istream_name: inp}, "outputs": {ostream_name: []}, } - self.rtlsim_multi_io(sim, io_dict) + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + sname = "_" + + # Change into so directory to ensure threshold files can be found + rtlsim_so = self.get_nodeattr("rtlsim_so") + so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) + olcwd = os.getcwd() + os.chdir(so_dir) + num_out_values = self.get_number_output_values() + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io(sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles() + ) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + os.chdir(olcwd) output = io_dict["outputs"][ostream_name] # Manage output data @@ -448,36 +469,6 @@ def execute_node(self, context, graph): context[node.output[0]] = output return - def rtlsim_multi_io(self, sim, io_dict): - "Run rtlsim for this node, supports multiple i/o streams." - - rtlsim_so = self.get_nodeattr("rtlsim_so") - so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) - olcwd = os.getcwd() - os.chdir(so_dir) - - # signal name prefix - # TODO if the interface names on this component get standardized, - # it won't need its own rtlsim_multi_io variant anymore and can just - # use the base class one - sname = "_" - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" - num_out_values = self.get_number_output_values() - reset_rtlsim(sim) - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - ) - self.set_nodeattr("cycles_rtlsim", total_cycle_count) - os.chdir(olcwd) - def code_generation_ipi(self): """Constructs and returns the TCL commands for node instantiation as an RTL block.""" From d612c29dccb82ba070af729b287010bfcf8fc4f7 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 1 Mar 2024 12:01:23 +0000 Subject: [PATCH 161/291] [CustomOp] clean up threshold weight generation --- .../fpgadataflow/rtl/thresholding_rtl.py | 83 ++++++++----------- 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index c112125925..26387a7192 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -565,7 +565,7 @@ def generate_params(self, model, path): """Please set mem_mode to "const", "decoupled", currently no other parameter value is supported!""" ) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): + def make_weight_file(self, weights, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig of weights. @@ -573,8 +573,6 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): Arguments: * weights : numpy array with weights to be put into the file - * weight_file_mode : one of { decoupled_verilog_dat, - decoupled_runtime} * weight_file_name : filename for the weight file to be generated """ @@ -583,49 +581,36 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): assert np.vectorize(tdt.allowed)( threshold_tensor ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - if "decoupled" in weight_file_mode: - # streaming thresholds need to be organized differently - # (1, pe, tmem, n_thres_steps) -> (1, tmem, pe, n_thres_steps) - decoupled_thres = np.transpose(threshold_tensor, (0, 2, 1, 3)) - # TODO add flips/reversals as needed here - # (1, tmem, pe, n_thres_steps) -(1, tmem, pe * n_thres_steps) - pe = self.get_nodeattr("PE") - ch = self.get_nodeattr("NumChannels") - n_thres_steps = self.get_nodeattr("numSteps") - decoupled_thres_pe_flipped = np.flip(decoupled_thres, axis=-2) - decoupled_thres = decoupled_thres.reshape(1, -1, pe * n_thres_steps) - decoupled_thres = decoupled_thres.copy() - decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.reshape( - 1, -1, pe * n_thres_steps - ) - decoupled_thres_pe_flipped = decoupled_thres_pe_flipped.copy() - width_padded = roundup_to_integer_multiple(weights.shape[1], 4) - weight_padded = np.zeros((weights.shape[0],width_padded)) - weight_padded[:weights.shape[0], :n_thres_steps ] = weights - weight_stream = [] - wdt = self.get_weight_datatype() - bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) - padding = np.zeros(width_padded, dtype=np.int32) - - chan_ind = 0 - cf = ch//pe - for fold in range(cf): - for c in range(2**(pe-1).bit_length()): - if (c==0 or c%pe != 0) and c < pe: - for w in weight_padded[chan_ind]: - w_packed = pack_innermost_dim_as_hex_string( - [w], wdt, bw_hexdigit, prefix="" - ).item() - weight_stream.append(w_packed) - chan_ind +=1 - else: - for z in padding: - w_packed = pack_innermost_dim_as_hex_string( - [z], wdt, bw_hexdigit, prefix="" - ).item() - weight_stream.append(w_packed) - with open(weight_file_name, "w") as f: - for val in weight_stream: - f.write(val + "\n") - else: - raise Exception("Unknown weight_file_mode") \ No newline at end of file + + pe = self.get_nodeattr("PE") + ch = self.get_nodeattr("NumChannels") + n_thres_steps = self.get_nodeattr("numSteps") + + width_padded = roundup_to_integer_multiple(weights.shape[1], 4) + weight_padded = np.zeros((weights.shape[0],width_padded)) + weight_padded[:weights.shape[0], :n_thres_steps ] = weights + weight_stream = [] + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) + padding = np.zeros(width_padded, dtype=np.int32) + + chan_ind = 0 + cf = ch//pe + for fold in range(cf): + for c in range(2**(pe-1).bit_length()): + if (c==0 or c%pe != 0) and c < pe: + for w in weight_padded[chan_ind]: + w_packed = pack_innermost_dim_as_hex_string( + [w], wdt, bw_hexdigit, prefix="" + ).item() + weight_stream.append(w_packed) + chan_ind +=1 + else: + for z in padding: + w_packed = pack_innermost_dim_as_hex_string( + [z], wdt, bw_hexdigit, prefix="" + ).item() + weight_stream.append(w_packed) + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") From 503efe7b55e8269a42ca0ea780c377461daf60ff Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 1 Mar 2024 17:52:45 +0000 Subject: [PATCH 162/291] [CustomOps] make weight files during HDL file generation --- .../custom_op/fpgadataflow/rtl/thresholding_rtl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 26387a7192..4adde1452d 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -31,13 +31,9 @@ import os import shutil import warnings -from math import ceil, log2 from pyverilator.util.axi_utils import rtlsim_multi_io, reset_rtlsim from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) +from qonnx.util.basic import roundup_to_integer_multiple from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.thresholding import Thresholding @@ -316,6 +312,10 @@ def generate_hdl(self, model): # Retrieve the destination directory for the final RTL files code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + weights = model.get_initializer(self.onnx_node.input[1]) + weights_fname = f"{code_gen_dir}/memblock.dat" + self.make_weight_file(weights,"decoupled", weights_fname) + for rtl_file_path in self.get_rtl_file_paths(): # read in original RTL template file template_data = self.get_rtl_template_data(rtl_file_path) @@ -565,7 +565,7 @@ def generate_params(self, model, path): """Please set mem_mode to "const", "decoupled", currently no other parameter value is supported!""" ) - def make_weight_file(self, weights, weight_file_name): + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig of weights. From 2c50994abe7f16a41141a6a578355e4e2fec85bc Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 1 Mar 2024 17:53:21 +0000 Subject: [PATCH 163/291] [tests] threshold test get the right impl_style --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index dfd14268e5..62d7b04278 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -274,7 +274,8 @@ def test_runtime_thresholds_read(impl_style,cfg): # Make sure that specialize layer did not default to HLS implementation assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) - op_inst = getCustomOp(model.graph.node[0]) + node = model.get_nodes_by_op_type(f"Thresholding_{impl_style}")[0] + op_inst = getCustomOp(node) op_inst.set_nodeattr("runtime_writeable_weights", 1) dat_fname = f"old_weights_{cfg}.dat" From d48c7119d30d936b5f32027c43a3a46825f6dcf1 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 1 Mar 2024 18:32:50 +0000 Subject: [PATCH 164/291] [CustomOp] Add doc string for memutil function Signed-off-by: aziz bahri --- src/finn/util/basic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 077e45200d..49220e9718 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -270,6 +270,15 @@ def find_next_power_of_2(n): def get_memutil_alternatives( req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True ): + '''Computes how many instances of a memory primitive are necessary to + implement a desired memory size, where req_mem_spec is the desired + size and the primitive_spec is the primitve size. The sizes are expressed + as tuples of (mem_width, mem_depth). Returns a list of tuples of the form + (primitive_name, (primitive_count, efficiency, waste)) where efficiency in + range [0,1] indicates how much of the total capacity is utilized, and waste + indicates how many bits of storage are wasted. If sort_min_waste is True, + the list is sorted by increasing waste. + ''' ret = [ (primitive_name, memutil(req_mem_spec, primitive_spec)) for (primitive_name, primitive_spec) in mem_primitives.items() From ff3458bc4388eb8fe787f360c1e061ed91b03182 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:43:11 +0000 Subject: [PATCH 165/291] [build dataflow]: add fpgapart as argument to SpecializeLayers transform --- src/finn/builder/build_dataflow_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index a75bbe98a1..72463a3865 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -403,7 +403,7 @@ def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.specialize_layers_config_file is not None: model = model.transform(GiveUniqueNodeNames()) model = model.transform(ApplyConfig(cfg.specialize_layers_config_file)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return model From 4f4385fd1defdb6adaf2d92ef7bfa7e64f716fba Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:47:39 +0000 Subject: [PATCH 166/291] [hls mvau]: remove duplicate method --- .../fpgadataflow/hls/matrixvectoractivation_hls.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index e279d3953a..55a84b9dcb 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -172,19 +172,6 @@ def get_template_param_values(self): return ret - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] From 055c8fe9a31544cae534a8d398d571772dbf15af Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:51:16 +0000 Subject: [PATCH 167/291] [hw mvau]: move get_verilog_top_module_intf_names to hw-op abstraction layer --- .../fpgadataflow/matrixvectoractivation.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index ac173e4af6..dc713c8b42 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -542,7 +542,7 @@ def minimize_weight_bit_width(self, model): self.set_nodeattr("weightDataType", wdt.name) return DataType[self.get_nodeattr("weightDataType")] - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 @@ -846,6 +846,19 @@ def derive_characteristic_fxns(self, period): io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + def code_generation_ipi(self): cmd = [] # add streamer if needed From fd0f796b735ad082219993bee968e82298f1256a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:54:00 +0000 Subject: [PATCH 168/291] added MVAU_rtl custom-op --- src/finn/custom_op/fpgadataflow/rtl/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 914c033584..b7a798be98 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -30,6 +30,7 @@ ConvolutionInputGenerator_rtl, ) from finn.custom_op.fpgadataflow.rtl.fmpadding_rtl import FMPadding_rtl +from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( StreamingDataWidthConverter_rtl, ) @@ -43,3 +44,4 @@ custom_op["FMPadding_rtl"] = FMPadding_rtl custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl +custom_op["MVAU_rtl"] = MVAU_rtl From 91a8c00cde59fb3bd995df7bb47e989823436abd Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:56:29 +0000 Subject: [PATCH 169/291] [transform]: minor fix to extracting op_type from node, added fpgapart as argument to SpecializeLayers transform --- src/finn/transformation/fpgadataflow/set_fifo_depths.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 1e25670a71..ca7499428f 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -266,8 +266,7 @@ def apply(self, model): for node in model.graph.node: # verify assumptions assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str(node) - op_type = node.op_type - assert not op_type.startswith("StreamingFIFO"), "Found existing StreamingFIFO node" + assert not node.op_type.startswith("StreamingFIFO"), "Found existing StreamingFIFO node" node = getCustomOp(node) ifd = node.get_nodeattr("inFIFODepths") ofd = node.get_nodeattr("outFIFODepths") @@ -283,8 +282,7 @@ def apply(self, model): ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1]) node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - - if op_type in extw_optypes: + if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) @@ -297,7 +295,7 @@ def apply(self, model): # insert stream infrastructure (DWC/FIFO) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(self.fpgapart)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) From d7f87146b8c01a4b3a2a85bc1ba6e9fc43165bb2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:58:38 +0000 Subject: [PATCH 170/291] [transform]: added fpgapart as attribute and functions to determine whether RTL MVU/VVU is supported --- .../fpgadataflow/specialize_layers.py | 74 +++++++++++++++++-- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 6c94f45d16..2bfb32caf6 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -26,21 +26,22 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np import warnings from onnx import helper +from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants +from finn.util.fpgadataflow import is_versal restricted_layers = [] -restricted_layers.append("MVAU") -restricted_layers.append("VectorVectorActivation") restricted_layers.append("Thresholding") -def _determine_impl_style(node): +def _determine_impl_style(node, fpgapart=""): optype = node.op_type # if rtl variant has specific restrictions @@ -62,10 +63,10 @@ def _determine_impl_style(node): if optype == "StreamingDataWidthConverter": return _dwc_determine_impl_style(node) if rtl_variant: - return "rtl" + impl_style = "rtl" # but if no rtl variant, set impl_style to hls elif hls_variant: - return "hls" + impl_style = "hls" # if there is neither an rtl nor hls variant # throw error else: @@ -121,6 +122,28 @@ def _determine_impl_style(node): else: # user setting can be fulfilled return "rtl" + elif optype == "MVAU": + if _mvu_rtl_possible(node): + if getCustomOp(node).get_nodeattr("noActivation") == 0: + # Split thresholding + pass + return "rtl" + else: + warn_str = """There is no RTL variant for %s. The node will automatically be + set to HLS variant.""" % ( + node.name, + ) + warnings.warn(warn_str) + elif optype == "VectorVectorActivation": + if _vvu_rtl_possible(node, fpgapart): + return "rtl" + else: + warn_str = """There is no RTL variant for %s. The node will automatically be + set to HLS variant.""" % ( + node.name, + ) + warnings.warn(warn_str) + if rtl_variant: return "rtl" elif hls_variant: @@ -194,9 +217,48 @@ def _swg_hls_possible(node): return False +def _mvu_rtl_possible(n): + # Checks whether RTL-based MVU is supported + act_width_in_range = ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 + ) or ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 + and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 + ) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = ( + getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0 + ) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) + + return act_width_in_range and weight_width_in_range and folding_supported + + +def _vvu_rtl_possible(n, fpgapart): + # Checks whether RTL-based VVU is supported + act_width_in_range = ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 + ) or ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 + and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 + ) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + folding_supported = ( + getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0 + ) and ( + np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0 + ) + is_versal_family = is_versal(fpgapart) + + return act_width_in_range and weight_width_in_range and folding_supported and is_versal_family + + class SpecializeLayers(Transformation): """Specialize all layers to either HLS or RTL variants""" + def __init__(self, fpgapart): + super().__init__() + self.fpgapart = fpgapart + def apply(self, model): graph = model.graph node_ind = 0 @@ -206,7 +268,7 @@ def apply(self, model): if not node.domain == "finn.custom_op.fpgadataflow": continue node_ind += 1 - impl_style = _determine_impl_style(node) + impl_style = _determine_impl_style(node, self.fpgapart) optype = node.op_type + "_" + impl_style new_node = helper.make_node( From 11d0c5ccbee3a8534df7dba5f76083d59c09f4b1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 10:59:29 +0000 Subject: [PATCH 171/291] [util]: added function to check if device is part of Versal family --- src/finn/util/fpgadataflow.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index aae438fac2..3d3d343cd4 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -69,3 +69,11 @@ def is_rtl_node(node): is_node = True return is_node + + +def is_versal(fpgapart): + """Returns whether board is part of the Versal family""" + return ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) From ea6fb3529203b8c23e90d8159b3a27517dad955a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 11:01:19 +0000 Subject: [PATCH 172/291] [rtl mvu/vvu]: rtl compute core, flow control and axi wrapper for MVU/VVU layers --- finn-rtllib/mvu/mvu_4sx4u.sv | 494 ++++++++++++++++++++++++++ finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 492 +++++++++++++++++++++++++ finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 430 ++++++++++++++++++++++ finn-rtllib/mvu/mvu_vvu_axi.sv | 375 +++++++++++++++++++ finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 97 +++++ finn-rtllib/mvu/replay_buffer.sv | 181 ++++++++++ 6 files changed, 2069 insertions(+) create mode 100644 finn-rtllib/mvu/mvu_4sx4u.sv create mode 100644 finn-rtllib/mvu/mvu_8sx8u_dsp48.sv create mode 100644 finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv create mode 100644 finn-rtllib/mvu/mvu_vvu_axi.sv create mode 100644 finn-rtllib/mvu/mvu_vvu_axi_wrapper.v create mode 100644 finn-rtllib/mvu/replay_buffer.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..aafe0e3429 --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,494 @@ +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i >= PE_REM && i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]); + end + assign hi4[i] = Hi4; + end : genHi + else if (i < 3) begin : genHiZero + assign hi4[i] = '0; + end : genHiZero + + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..1423153c97 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,492 @@ +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Ouput + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); + localparam int unsigned PE_REM = 2*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]? 0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) + .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + + // Conclusive high part accumulation + if(PE_REM == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]); + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv new file mode 100644 index 0000000000..53cf71fd5f --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + *****************************************************************************/ + +module mvu_vvu_8sx9_dsp58 #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0, + + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD, + localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD + ) + ( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p + ); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + +//-------------------- Declare global signals --------------------\\ + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator + +//-------------------- Shift register for opmode select signal --------------------\\ + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end + end + assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; + end + end + end; + +//-------------------- Buffer for input activations --------------------\\ + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= + // synthesis translate_off + zero ? '1 : + // synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[i][EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + //w[i][3*j +: LANES_OCCUPIED]; + w[SIMD*i+3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; + end + end + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genExternalPregWeight + else begin : genInpDSPWeight + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genInpDSPWeight + end : genWeightSIMD + end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ + for (genvar i=0; i0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[i] = pp[ACCU_WIDTH-1:0]; + end + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[i][j]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). + ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : genDSPPE + +endmodule : mvu_vvu_8sx9_dsp58 diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..91e3b77216 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, + * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * 'unconstrained' LUT-based MVU and VVU. + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. + * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned SEGMENTLEN = 0, + + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + + bit PUMPED_COMPUTE = 0, // requires an even SIMD % 2 == 0 + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 +)( + // Global Control + input logic ap_clk, + input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + end + + uwire clk = ap_clk; + uwire clk2x = ap_clk2x; + uwire rst = !ap_rst_n; + + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = MH/PE; + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + + //- Unflatten inputs into structured matrices --------------------------- + localparam int unsigned ACT_PE = IS_MVU? 1 : PE; + typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + + //- Conditional Activations Layout Adjustment for VVU + uwire mvu_a_t amvau_i; + if (IS_MVU || (PE == 1)) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) + for(genvar pe = 0; pe < ACT_PE; pe++) begin + for(genvar simd = 0; simd < SIMD; simd++) begin + assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe]; + end + end + end : genVVUInput + + //- Flow Control Bracket around Compute Core ---------------------------- + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + + //- Conditionally Pumped DSP Compute ------------------------------------ + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire ovld; + uwire dsp_p_t odat; + if(1) begin : blkDsp + localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; + localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); + typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; + typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; + + uwire dsp_clk; + uwire dsp_en; + + uwire dsp_last; + uwire dsp_zero; + uwire dsp_w_t dsp_w; + uwire dsp_a_t dsp_a; + + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + if(!PUMPED_COMPUTE) begin : genUnpumpedCompute + assign dsp_clk = clk; + assign dsp_en = en; + + assign dsp_last = alast && avld; + assign dsp_zero = !istb; + assign dsp_w = mvu_w; + assign dsp_a = amvau_i; + + assign ovld = dsp_vld; + assign odat = dsp_p; + end : genUnpumpedCompute + else begin : genPumpedCompute + assign dsp_clk = clk2x; + + // Identify second fast cycle just before active slow clock edge + logic Active = 0; + if(1) begin : blkActive + uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); + always_ff @(posedge clk2x) Active <= clk_lut[1]; + end : blkActive + + // The input for a slow cycle is split across two fast cycles along the SIMD dimension. + // - Both fast cycles are controlled by the same enable state. + // - A zero cycle is duplicated across both fast cycles. + // - The last flag must be restricted to the second fast cycle. + + dsp_w_t W = 'x; + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + + uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; + for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) W[pe] <= 'x; + else if(en) W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegW + + dsp_a_t A = 'x; + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + + uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; + for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) A[pe] <= 'x; + else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegA + + logic Zero = 1; + logic Last = 0; + always_ff @(posedge clk2x) begin + if(rst) begin + Zero <= 1; + Last <= 0; + end + else if(en) begin + Zero <= !istb; + Last <= alast && avld && Active; + end + end + + assign dsp_en = en; + assign dsp_last = Last; + assign dsp_zero = Zero; + assign dsp_w = W; + assign dsp_a = A; + + // Since no two consecutive last cycles will ever be asserted on the input, + // valid outputs will also always be spaced by, at least, one other cycle. + // We can always hold a captured output for two cycles to allow the slow + // clock to pick it up. + logic Vld = 0; + dsp_p_t P = 'x; + always_ff @(posedge clk2x) begin + if(rst) begin + Vld <= 0; + P <= 'x; + end + else if(en) begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); + end + end + assign ovld = Vld; + assign odat = P; + + end : genPumpedCompute + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_vvu_lut": + mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + + end : blkDsp + +//-------------------- Output register slice --------------------\\ + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. + struct packed { + logic rdy; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; + + always_ff @(posedge clk) begin + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end + else begin + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? odat : A.dat + }; + end + end + end + assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + +endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..ee067fa8b5 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,97 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU & VVU. + *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = $IS_MVU$, + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter PUMPED_COMPUTE = 0, + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + // (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *) + // (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *) + // input ap_clk2x, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_clk2x(1'b0), + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..3e2766f63d --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,181 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer #( + int unsigned LEN, // Sequence length + int unsigned REP, // Sequence replay count + int unsigned W // Data width +)( + input logic clk, + input logic rst, + + input logic [W-1:0] idat, + input logic ivld, + output logic irdy, + + output logic [W-1:0] odat, + output logic olast, + output logic ofin, + output logic ovld, + input logic ordy +); + + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + + // Track position in Sequence + uwire last_item; + uwire shift; + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; + end + + if(REP == 1) begin + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; + end + else begin + + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = LEN < 2? 1 : $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; + always_ff @(posedge clk) begin + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + end + + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; + end + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; + end + end + end + + end + +endmodule : replay_buffer From b295329694dc19eb97cb1fc76b8e57426cca4101 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 11:02:43 +0000 Subject: [PATCH 173/291] [tb]: testbench for replay_buffer and mvu/vvu layers --- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 239 +++++++++++++++++++++++++ finn-rtllib/mvu/tb/replay_buffer_tb.sv | 130 ++++++++++++++ finn-rtllib/mvu/tb/vvu_axi_tb.sv | 227 +++++++++++++++++++++++ 3 files changed, 596 insertions(+) create mode 100644 finn-rtllib/mvu/tb/mvu_axi_tb.sv create mode 100644 finn-rtllib/mvu/tb/replay_buffer_tb.sv create mode 100644 finn-rtllib/mvu/tb/vvu_axi_tb.sv diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..2f35a112ab --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,239 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module mvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 1; + localparam string COMPUTE_CORE = "mvu_4sx4u"; + localparam int unsigned MW = 120; + localparam int unsigned MH = 40; + localparam int unsigned SIMD = 20; + localparam int unsigned PE = 10; + localparam int unsigned SEGMENTLEN = 2.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 0; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); + // else + // res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : + // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); + // end + // end + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_axi_tb \ No newline at end of file diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv new file mode 100644 index 0000000000..fbb45845e1 --- /dev/null +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -0,0 +1,227 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI-lite interface wrapper. + *****************************************************************************/ + +module vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 0; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 25; // Kernel*Kernel + localparam int unsigned MH = 4; // Channels + localparam int unsigned SIMD = 25; // MW%SIMD == 0 + localparam int unsigned PE = 2; // MH%PE == 0 + localparam int unsigned SEGMENTLEN = 3.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[NF*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : vvu_axi_tb From 7cf62c7017d146bd50377d05eb5689a49604629e Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 4 Mar 2024 12:21:14 +0000 Subject: [PATCH 174/291] [Tests] Specialize layers before checksum hook insertion --- tests/fpgadataflow/test_fpgadataflow_checksum.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index c51030764c..81a4e3e33c 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -141,6 +141,7 @@ def test_fpgadataflow_checksum(): # use a graph consisting of two fc layers to test # checksum node insertion model = create_two_fc_model() + model = model.transform(SpecializeLayers()) # set checksum output hook for n in model.graph.node: From 83fe7e83e8bbe7d2044b2d15520753530362cde3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 12:23:22 +0000 Subject: [PATCH 175/291] [rtl mvu]: added MVU_rtl layer --- .../rtl/matrixvectoractivation_rtl.py | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..ae04b003bd --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -0,0 +1,307 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.util.fpgadataflow import is_versal + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... here can be any shape (representing groups of vectors) + + +class MVAU_rtl(MVAU, RTLBackend): + """Class that corresponds to finn-rtl Matrix Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(MVAU.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + raise Exception("cppsim not possible for RTL MVAU, please set exec_mode to rtlsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") + in_ind += 1 + + if mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + reset_rtlsim(sim) + toggle_clk(sim) + if mem_mode in ["external", "decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + + def lut_estimation(self): + return 0 + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl(model, fpgapart, clk) + + def instantiate_ip(self, cmd): + # instantiate the RTL IP + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert ( + clk > 0.741 + ), """Infeasible clk target of {} ns has been set, + consider lowering the targeted clock frequency!""".format( + clk + ) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + assert ( + self.get_nodeattr("resType") != "lut" + ), """LUT-based RTL-MVU implementation currently not supported! + Please change resType for {}""".format( + self.onnx_node.name + ) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal_family = is_versal(fpgapart) + + if is_versal_family: + return "mvu_vvu_8sx9_dsp58" + else: + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + if (act_width == 4 and weight_width == 4) and not (is_versal_family): + return "mvu_4sx4u" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + # code_gen_dict["$PUMPED_COMPUTE$"] = [str(0)] + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim From 4f19aa44238fd62b70e4f7cfeff12590694ce380 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 4 Mar 2024 12:32:35 +0000 Subject: [PATCH 176/291] [Tests] Fix for cppsim with impl style rtl in SWG --- tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 1fe96d6bd7..5de6e7c1d1 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -217,7 +217,10 @@ def test_fpgadataflow_slidingwindow( # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] - if dw == 0: + # if cppsim and impl style rtl is selected, the node execution is done by the hw op parent + # so, no reordering/shaping of the output is needed + # because there is no concept of SIMD parallelism in the hw abstraction layer execution + if dw == 0 or (impl_style == "rtl" and exec_mode == "cppsim"): assert (y_produced == y_expected).all() else: y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) From 649c4284b5348d159fc8819ce6690444c09ffc29 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 4 Mar 2024 15:30:53 +0000 Subject: [PATCH 177/291] [test]: added mvau_rtl test case --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 101 +++++++++---------- 1 file changed, 45 insertions(+), 56 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 85cca66835..5f979e0b76 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -26,8 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import os -import pickle import pytest import numpy as np @@ -37,7 +35,12 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames, GiveReadableTensorNames, ApplyConfig +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.util.basic import ( calculate_signed_dot_prod_range, gen_finn_dt_tensor, @@ -45,19 +48,25 @@ ) import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.derive_characteristic import DeriveCharacteristic from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode -from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths -from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -394,6 +403,7 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) inst.set_nodeattr("rtlsim_trace", "mvau_trace.vcd") + inst.set_nodeattr("preferred_impl_style", "hls") # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -610,36 +620,26 @@ def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh, b assert chrc_out[0, exp_total_cycles] == nf -# @pytest.mark.parametrize("mh", [36]) -# @pytest.mark.parametrize("mw", [256]) -@pytest.mark.parametrize("mh", [1]) -@pytest.mark.parametrize("mw", [8]) -# @pytest.mark.parametrize("pe", [1, 4, 9, 36]) -# @pytest.mark.parametrize("simd", [1, 4, 16, 64, 256]) -# @pytest.mark.parametrize("pe", [1, 3, 9]) -# @pytest.mark.parametrize("simd", [1, 3, 6, 18, 36]) -@pytest.mark.parametrize("pe", [1]) -@pytest.mark.parametrize("simd", [4]) -# @pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) -# @pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) -@pytest.mark.parametrize("idt", [DataType["UINT4"]]) -@pytest.mark.parametrize("wdt", [DataType["INT4"]]) -# @pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) -@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S"]) -# @pytest.mark.parametrize("clk_ns", [1.66, 4]) -@pytest.mark.parametrize("clk_ns", [4]) +@pytest.mark.parametrize("mh", [18]) +@pytest.mark.parametrize("mw", [128]) +@pytest.mark.parametrize("pe", [1, 6, 9, 18]) +@pytest.mark.parametrize("simd", [1, 4, 16, 64, 128]) +@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("clk_ns", [1.66, 4]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_rtl_mvau( - mh, mw, pe, simd, idt, wdt, part, clk_ns -): +def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: - pytest.skip("Skip test for varying clk for devices other than Versal, since this variable doesn't change anything for this test") + pytest.skip( + """Skip test for varying clk for devices other than Versal, + since this variable only affects DSP58s""" + ) - build_dir = os.environ["FINN_BUILD_DIR"] # Create test input vector (produced by SWG) - ofm_shape = (2, 2) + ofm_shape = (3, 3) ofm_h, ofm_w = ofm_shape ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) @@ -648,18 +648,15 @@ def test_fpgadataflow_rtl_mvau( model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - model.save(build_dir + "/matmul.onnx") - # Create MatMul & obtain golden reference output - A = gen_finn_dt_tensor(model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in")) + A = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) input_dict = prepare_inputs(A, idt, wdt, inp_name="global_in") # Execute ONNX model output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] - with open(build_dir + "/onnx_output.pkl", "wb") as f: - pickle.dump(output_matmul, f) - # Create MVAU (HLS) model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) model = model.transform(GiveUniqueNodeNames()) @@ -671,46 +668,38 @@ def test_fpgadataflow_rtl_mvau( "PE": pe, "SIMD": simd, "mem_mode": "decoupled", - "ram_style": "auto", "resType": "dsp", - "preferred_impl_style" : "rtl" + "preferred_impl_style": "rtl", }, } model = model.transform(ApplyConfig(folding_config)) - model.save(build_dir + "/mvau_hls.onnx") + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) # Apply convert-to-rtl step - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers(part)) model = model.transform(GiveUniqueNodeNames()) - model.save(build_dir + "/mvau_rtl.onnx") - - # Reset rtlsim_so and ip-related paths such that new Pyverilator SO and IP is generated - for n in model.graph.node: - getCustomOp(n).set_nodeattr("rtlsim_trace", build_dir + "/mvu_trace_rtl_nodebynode.vcd") - model = model.transform(SetExecMode("rtlsim")) model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] - with open(build_dir + "/mvau_rtl_output.pkl", "wb") as f: - pickle.dump(output_mvau_rtl, f) - - model.save(build_dir + "/mvau_rtl_sim.onnx") - import pdb; pdb.set_trace() - assert (output_matmul == output_mvau_rtl).all(), "Output of ONNX model not matching output of node-by-node sim!" + assert ( + output_matmul == output_mvau_rtl + ).all(), "Output of ONNX model not matching output of node-by-node sim!" model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(part, clk_ns)) - os.environ["RTLSIM_TRACE_DEPTH"] = "3" model.set_metadata_prop("rtlsim_so", "") model.set_metadata_prop("exec_mode", "rtlsim") - model.set_metadata_prop("rtlsim_trace", build_dir + "/mvu_trace_rtl_stitch.vcd") - model.save(build_dir + "/stitched_ip.onnx") output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] - assert (output_matmul == output_mvau_rtl_stitch).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" \ No newline at end of file + assert ( + output_matmul == output_mvau_rtl_stitch + ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" From 87f551fc3f237ffb35e8df823c8d88e5180d6e7c Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 11:13:36 +0000 Subject: [PATCH 178/291] [Pre-commit] Run linting --- .../fpgadataflow/rtl/thresholding_rtl.py | 46 +++++----- .../custom_op/fpgadataflow/thresholding.py | 5 +- src/finn/util/basic.py | 4 +- .../test_convert_to_hw_thresholding.py | 2 +- .../test_fpgadataflow_thresholding.py | 83 ++++++++++--------- 5 files changed, 77 insertions(+), 63 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 4adde1452d..6ee940883a 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -31,7 +31,7 @@ import os import shutil import warnings -from pyverilator.util.axi_utils import rtlsim_multi_io, reset_rtlsim +from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple @@ -76,7 +76,7 @@ def get_nodeattr_types(self): # setting to 0 may save some FFs but otherwise leave on "deep_pipeline": ("i", False, 1, {0, 1}), } - my_attrs.update(Thresholding.get_nodeattr_types(self)) + my_attrs.update(Thresholding.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs @@ -314,7 +314,7 @@ def generate_hdl(self, model): weights = model.get_initializer(self.onnx_node.input[1]) weights_fname = f"{code_gen_dir}/memblock.dat" - self.make_weight_file(weights,"decoupled", weights_fname) + self.make_weight_file(weights, "decoupled", weights_fname) for rtl_file_path in self.get_rtl_file_paths(): # read in original RTL template file @@ -346,7 +346,9 @@ def prepare_rtlsim(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") verilog_paths = [code_gen_dir] - verilog_files = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] + verilog_files = [ + x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list() + ] dat_files = self.get_all_meminit_filenames(abspath=True) single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") for dat_file in dat_files: @@ -376,7 +378,9 @@ def execute_node(self, context, graph): ) mode = self.get_nodeattr("exec_mode") if mode == "cppsim": - raise Exception("cppsim not possible for RTL Thresholding, please set exec_mode to rtlsim") + raise Exception( + "cppsim not possible for RTL Thresholding, please set exec_mode to rtlsim" + ) elif mode == "rtlsim": code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") else: @@ -442,13 +446,14 @@ def execute_node(self, context, graph): os.chdir(so_dir) num_out_values = self.get_number_output_values() reset_rtlsim(sim) - total_cycle_count = rtlsim_multi_io(sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - liveness_threshold=pyverilate_get_liveness_threshold_cycles() - ) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) os.chdir(olcwd) output = io_dict["outputs"][ostream_name] @@ -472,7 +477,9 @@ def execute_node(self, context, graph): def code_generation_ipi(self): """Constructs and returns the TCL commands for node instantiation as an RTL block.""" - rtl_file_list = [x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list()] + rtl_file_list = [ + x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list() + ] code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name cmd = ["file mkdir %s" % source_target] @@ -565,6 +572,7 @@ def generate_params(self, model, path): """Please set mem_mode to "const", "decoupled", currently no other parameter value is supported!""" ) + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or @@ -587,24 +595,24 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): n_thres_steps = self.get_nodeattr("numSteps") width_padded = roundup_to_integer_multiple(weights.shape[1], 4) - weight_padded = np.zeros((weights.shape[0],width_padded)) - weight_padded[:weights.shape[0], :n_thres_steps ] = weights + weight_padded = np.zeros((weights.shape[0], width_padded)) + weight_padded[: weights.shape[0], :n_thres_steps] = weights weight_stream = [] wdt = self.get_weight_datatype() bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) padding = np.zeros(width_padded, dtype=np.int32) chan_ind = 0 - cf = ch//pe + cf = ch // pe for fold in range(cf): - for c in range(2**(pe-1).bit_length()): - if (c==0 or c%pe != 0) and c < pe: + for c in range(2 ** (pe - 1).bit_length()): + if (c == 0 or c % pe != 0) and c < pe: for w in weight_padded[chan_ind]: w_packed = pack_innermost_dim_as_hex_string( [w], wdt, bw_hexdigit, prefix="" ).item() weight_stream.append(w_packed) - chan_ind +=1 + chan_ind += 1 else: for z in padding: w_packed = pack_innermost_dim_as_hex_string( diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index d3ba724818..822bb1476f 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -31,6 +31,7 @@ from qonnx.core.datatype import DataType from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions + from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp @@ -165,7 +166,6 @@ def get_outstream_width(self, ind=0): return o_bits * self.get_nodeattr("PE") def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") fold = self.calc_tmem() vecs = list(self.get_nodeattr("numInputVectors")) @@ -194,7 +194,6 @@ def get_exp_cycles(self): # Channels/PE * batch size * fmdim * fmdim return np.prod(self.get_folded_output_shape()[:-1]) - def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: @@ -259,4 +258,4 @@ def calc_tmem(self): """Calculates and returns TMEM.""" num_channels = self.get_nodeattr("NumChannels") pe = self.get_nodeattr("PE") - return num_channels // pe \ No newline at end of file + return num_channels // pe diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 49220e9718..10edb7dc54 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -270,7 +270,7 @@ def find_next_power_of_2(n): def get_memutil_alternatives( req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True ): - '''Computes how many instances of a memory primitive are necessary to + """Computes how many instances of a memory primitive are necessary to implement a desired memory size, where req_mem_spec is the desired size and the primitive_spec is the primitve size. The sizes are expressed as tuples of (mem_width, mem_depth). Returns a list of tuples of the form @@ -278,7 +278,7 @@ def get_memutil_alternatives( range [0,1] indicates how much of the total capacity is utilized, and waste indicates how many bits of storage are wasted. If sort_min_waste is True, the list is sorted by increasing waste. - ''' + """ ret = [ (primitive_name, memutil(req_mem_spec, primitive_spec)) for (primitive_name, primitive_spec) in mem_primitives.items() diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index 3f0487f9f7..ee161a9b95 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -32,10 +32,10 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.custom_op.registry import getCustomOp from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 62d7b04278..899773b680 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -42,7 +42,7 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation -from finn.core.rtlsim_exec import rtlsim_exec, reset_rtlsim +from finn.core.rtlsim_exec import rtlsim_exec from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -56,6 +56,7 @@ test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 + def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): return np.random.randint( input_data_type.min(), @@ -63,21 +64,26 @@ def generate_random_threshold_values(input_data_type, num_input_channels, num_st (num_input_channels, num_steps), ).astype(np.float32) + def sort_thresholds_increasing(thresholds): return np.sort(thresholds, axis=1) + # n = batch, c = channel, h = height, w = width of feature map # Standard = NCHW; FINN = NHWC # Convert from NHWC(FINN) to NCHW(Standard) def layout_FINN2NCHW(data): return np.transpose(data, (0, 3, 1, 2)) + # Convert from NCHW(Standard) to NHWC(FINN) def layout_NCHW2FINN(data): return np.transpose(data, (0, 2, 3, 1)) -def make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs): +def make_single_thresholding_modelwrapper( + impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs +): NumChannels = T.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) @@ -100,7 +106,7 @@ def make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, m ActVal=actval, mem_mode=mem_mode, numInputVectors=n_inp_vecs, - preferred_impl_style=impl_style + preferred_impl_style=impl_style, ) graph = helper.make_graph( nodes=[Thresholding_node], @@ -136,7 +142,7 @@ def make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, m @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_mode): +def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem_mode): if impl_style == "rtl" and exec_mode == "cppsim": pytest.skip("rtl implstyle has no cppsim, skipping") if nf == -1: @@ -152,9 +158,7 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ n_steps = act.get_num_possible_values() - 1 # Generate random, non-decreasing thresholds - thresholds = generate_random_threshold_values( - idt, ich, n_steps - ) + thresholds = generate_random_threshold_values(idt, ich, n_steps) thresholds = sort_thresholds_increasing(thresholds) @@ -165,15 +169,8 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ # Build DUT model = make_single_thresholding_modelwrapper( - impl_style, - thresholds, - pe, - idt, - odt, - actval, - mem_mode, - n_inp_vecs - ) + impl_style, thresholds, pe, idt, odt, actval, mem_mode, n_inp_vecs + ) # Expected Reference output # multithreshold util fxn wants NCHW input, not NHWC @@ -238,17 +235,18 @@ def test_fpgadataflow_thresholding(impl_style,idt, act, nf, ich, exec_mode, mem_ assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 + @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) -@pytest.mark.parametrize("cfg", [(1,1), (6,2), (6,3), (8,2), (8,4)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 2), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_read(impl_style,cfg): - """ Read back threshold weights during runtime +def test_runtime_thresholds_read(impl_style, cfg): + """Read back threshold weights during runtime - 1. Create random initial weights T - 2. Execute model - 3. Read back weights via AXI - 4. Compare with initial weights T + 1. Create random initial weights T + 2. Execute model + 3. Read back weights via AXI + 4. Compare with initial weights T """ ch = cfg[0] pe = cfg[1] @@ -268,7 +266,9 @@ def test_runtime_thresholds_read(impl_style,cfg): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = make_single_thresholding_modelwrapper( + impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs + ) model = model.transform(SpecializeLayers()) # Make sure that specialize layer did not default to HLS implementation @@ -303,6 +303,7 @@ def test_runtime_thresholds_read(impl_style,cfg): exec_ctx = {"inp": in_tensor} extracted_weight_stream = [] + def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): @@ -331,20 +332,21 @@ def read_weights(sim): # Validate the output is as expected assert (y == expected).all() + @pytest.mark.parametrize("impl_style", ["hls", "rtl"]) -@pytest.mark.parametrize("cfg", [(1,1), (6,2), (6,3), (8,2), (8,4)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 2), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_write(impl_style,cfg): - """ Write threshold weights during runtime - - 1. Create random initial weights T_init - 2. Create model with initial weights - 3. Create new set of weights T_write - 4. Write T_write using AXI bus - 5. Read back using AXI bus to T_read - 6. Compare T_write and T_read - 7. Validate outputs with expected vectors +def test_runtime_thresholds_write(impl_style, cfg): + """Write threshold weights during runtime + + 1. Create random initial weights T_init + 2. Create model with initial weights + 3. Create new set of weights T_write + 4. Write T_write using AXI bus + 5. Read back using AXI bus to T_read + 6. Compare T_write and T_read + 7. Validate outputs with expected vectors """ ch = cfg[0] pe = cfg[1] @@ -366,7 +368,9 @@ def test_runtime_thresholds_write(impl_style,cfg): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(impl_style, T_init, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = make_single_thresholding_modelwrapper( + impl_style, T_init, pe, idt, odt, actval, mem_mode, n_inp_vecs + ) model = model.transform(SpecializeLayers()) # Validate that specialize layer did not default to HLS implementation @@ -381,7 +385,7 @@ def test_runtime_thresholds_write(impl_style,cfg): # provide non-decreasing thresholds T_write = np.sort(T_write, axis=1) - dat_fname = f"T_write_{cfg}.dat" # distinguish fname per paramter for distributed testing + dat_fname = f"T_write_{cfg}.dat" # distinguish fname per paramter for distributed testing op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname) with open(dat_fname, "r") as f: T_write_stream = f.read().strip() @@ -407,12 +411,15 @@ def test_runtime_thresholds_write(impl_style,cfg): in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) exec_ctx_write = {"inp": in_tensor} + def write_weights(sim): addr = 0 for nw in T_write_stream: axilite_write(sim, addr, nw, basename="s_axilite_0_") addr += 4 + T_read_stream = [] + def read_weights(sim): addr = 0 for i in range(len(T_write_stream)): @@ -438,4 +445,4 @@ def read_weights(sim): expected += act.min() # Validate the output is as expected - assert (y == expected).all() \ No newline at end of file + assert (y == expected).all() From f8c987ccccafe0b1bff449d325cfd74282e1f428 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 11:53:24 +0000 Subject: [PATCH 179/291] [RTL layers] Pass model by default to generate hdl functionality and clean up rtl thresholding class --- .../rtl/convolutioninputgenerator_rtl.py | 2 +- .../fpgadataflow/rtl/fmpadding_rtl.py | 2 +- .../rtl/streamingdatawidthconverter_rtl.py | 2 +- .../fpgadataflow/rtl/streamingfifo_rtl.py | 2 +- .../fpgadataflow/rtl/thresholding_rtl.py | 203 +++++++----------- src/finn/custom_op/fpgadataflow/rtlbackend.py | 2 +- .../test_fpgadataflow_thresholding.py | 2 - 7 files changed, 88 insertions(+), 127 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 08564ca6da..4bce80c658 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -839,7 +839,7 @@ def select_impl_style(self): return impl_style - def generate_hdl(self): + def generate_hdl(self, model): """Generates HDL code and wrapper for the IP, depending on required implementation style.""" impl_style = self.select_impl_style() diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index 19765d64c4..33293f45e1 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -171,7 +171,7 @@ def get_dynamic_config(self, ifm_dims=None, pads=None): } return config - def generate_hdl(self): + def generate_hdl(self, model): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" template_path = rtlsrc + "/fmpadding_template.v" dims = self.get_nodeattr("ImgDim") diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index ef918b5db8..8afc6e7ad5 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -137,7 +137,7 @@ def get_template_values(self): } return code_gen_dict - def generate_hdl(self): + def generate_hdl(self, model): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/hdl" template_path = rtlsrc + "/dwc_template.v" code_gen_dict = self.get_template_values() diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py index a9d9e689eb..581d93394b 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -82,7 +82,7 @@ def get_verilog_top_module_intf_names(self): ret["ap_none"] = ["maxcount"] return ret - def generate_hdl(self): + def generate_hdl(self, model): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fifo/hdl" template_path = rtlsrc + "/fifo_template.v" diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 6ee940883a..c39ae74a38 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -30,7 +30,6 @@ import numpy as np import os import shutil -import warnings from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple @@ -116,28 +115,6 @@ def get_memory_estimate(self): res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count return res_dict - def infer_node_datatype(self, model): - """Used for FINN DataType inference: set the output tensors' datatypes - accordingly for this node""" - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype().name), - str(idt.name), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - """Required by the FINN nalysis module. Checks if custom ops in graph - are correctly built, with all attributes and inputs.""" - return [] - def bram_estimation(self): res_dict = self.get_memory_estimate() return res_dict.get("BRAM", 0) @@ -301,9 +278,6 @@ def dump_rtl_data(self, dest_dir, filename, data): f.write(data) return - def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl(model) - def generate_hdl(self, model): """Prepare HDL files from templates for synthesis""" # Generate a dictionary of values to put in RTL template @@ -369,20 +343,92 @@ def prepare_rtlsim(self): return sim def execute_node(self, context, graph): - # Perform input checks - if self.get_nodeattr("exec_mode") != "rtlsim": - raise Exception( - "Invalid exec_mode value: {}; exec_mode must be set to '{}'".format( - self.get_nodeattr("exec_mode"), "rtlsim" - ) - ) mode = self.get_nodeattr("exec_mode") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception( - "cppsim not possible for RTL Thresholding, please set exec_mode to rtlsim" - ) + Thresholding.execute_node(self, context, graph) elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for Thresholding_rtl") + in_ind += 1 + + # Create a PyVerilator wrapper of the RTLSim .so + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + io_names = self.get_verilog_top_module_intf_names() + istream_name = io_names["s_axis"][0][0] + ostream_name = io_names["m_axis"][0][0] + io_dict = { + "inputs": {istream_name: inp}, + "outputs": {ostream_name: []}, + } + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + sname = "_" + + # Change into so directory to ensure threshold files can be found + rtlsim_so = self.get_nodeattr("rtlsim_so") + so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) + olcwd = os.getcwd() + os.chdir(so_dir) + num_out_values = self.get_number_output_values() + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + os.chdir(olcwd) + output = io_dict["outputs"][ostream_name] + + # Manage output data + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} @@ -390,89 +436,6 @@ def execute_node(self, context, graph): mode ) ) - node = self.onnx_node - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for Thresholding_rtl") - in_ind += 1 - - # Create a PyVerilator wrapper of the RTLSim .so - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - io_names = self.get_verilog_top_module_intf_names() - istream_name = io_names["s_axis"][0][0] - ostream_name = io_names["m_axis"][0][0] - io_dict = { - "inputs": {istream_name: inp}, - "outputs": {ostream_name: []}, - } - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" - sname = "_" - - # Change into so directory to ensure threshold files can be found - rtlsim_so = self.get_nodeattr("rtlsim_so") - so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) - olcwd = os.getcwd() - os.chdir(so_dir) - num_out_values = self.get_number_output_values() - reset_rtlsim(sim) - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - ) - self.set_nodeattr("cycles_rtlsim", total_cycle_count) - os.chdir(olcwd) - output = io_dict["outputs"][ostream_name] - - # Manage output data - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - return def code_generation_ipi(self): """Constructs and returns the TCL commands for node instantiation as an RTL diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 96deb49161..264de25749 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -54,7 +54,7 @@ def code_generation_ipi(self): pass def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl() + self.generate_hdl(model) # TODO: Implement alternative def hls_sname(self): diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 899773b680..ecf4384d34 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -143,8 +143,6 @@ def make_single_thresholding_modelwrapper( @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem_mode): - if impl_style == "rtl" and exec_mode == "cppsim": - pytest.skip("rtl implstyle has no cppsim, skipping") if nf == -1: nf = ich pe = ich // nf From 3244048e423ef16f7df67c87bc4ebd53b7ccf8eb Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 12:23:47 +0000 Subject: [PATCH 180/291] [Thresholding HLS] Clean up weightstream width functions --- .../fpgadataflow/hls/thresholding_hls.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 7b9809f495..9127261dd3 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -102,13 +102,6 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if self.get_nodeattr("mem_mode") == "decoupled": - return super().get_weightstream_width() - else: - return 0 - def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required by the AXI Stream spec. Used in decoupled mode.""" @@ -116,9 +109,11 @@ def get_weightstream_width_padded(self): return roundup_to_integer_multiple(weight_width, 8) def get_ap_int_max_w(self): - temp_value = super().get_ap_int_max_w() - weightstream = self.get_weightstream_width() - return max([weightstream, temp_value]) + ap_int_max_w = super().get_ap_int_max_w() + if self.get_nodeattr("mem_mode") == "decoupled": + weightstream = self.get_weightstream_width() + ap_int_max_w = max([weightstream, ap_int_max_w]) + return ap_int_max_w def get_template_param_values(self): """Returns the template parameter values according to input, output and weight From bf17bc3ec184403a47f70c5f19f0a6edb857c6c3 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 14:11:08 +0000 Subject: [PATCH 181/291] [Threshold RTL] Remove unused generate params fct --- .../fpgadataflow/rtl/thresholding_rtl.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index c39ae74a38..f9acece073 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -514,28 +514,6 @@ def get_dynamic_config(self, weights, address_stride=1): return config - def generate_params(self, model, path): - code_gen_dir = path - thresholds = model.get_initializer(self.onnx_node.input[1]) - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - # save thresholds in thresh.h - weight_filename = "{}/thresh.h".format(code_gen_dir) - self.make_weight_file(thresholds, "hls_header", weight_filename) - elif mem_mode == "decoupled": - # save decoupled weights for cppsim - weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) - self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) - # also save weights as Verilog .dat file - # This file will be ignored when synthesizing UltraScale memory. - weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) - self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", - currently no other parameter value is supported!""" - ) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or From 8a6832716e11ac25ce78e7644cab1f94bcef4729 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 14:34:06 +0000 Subject: [PATCH 182/291] [Thresholding] Code clean for generation of hw compatible tensor --- .../fpgadataflow/hls/thresholding_hls.py | 53 ++----------------- 1 file changed, 3 insertions(+), 50 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 9127261dd3..cedddf5dd5 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -31,10 +31,7 @@ import textwrap from math import ceil, log2 from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) +from qonnx.util.basic import roundup_to_integer_multiple from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.thresholding import Thresholding @@ -109,7 +106,7 @@ def get_weightstream_width_padded(self): return roundup_to_integer_multiple(weight_width, 8) def get_ap_int_max_w(self): - ap_int_max_w = super().get_ap_int_max_w() + ap_int_max_w = HLSBackend.get_ap_int_max_w(self) if self.get_nodeattr("mem_mode") == "decoupled": weightstream = self.get_weightstream_width() ap_int_max_w = max([weightstream, ap_int_max_w]) @@ -128,50 +125,6 @@ def get_template_param_values(self): return ret - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for unsigned inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = mh // pe - assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" - if not self.get_input_datatype().signed(): - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" - ret = orig_thres_matrix - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (mh, 1)) - assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or @@ -185,7 +138,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ - threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + threshold_tensor = self.get_hw_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() assert np.vectorize(tdt.allowed)( threshold_tensor From 4e244a7c466512b9f865cfc3c675b27e13f2c655 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 14:50:47 +0000 Subject: [PATCH 183/291] [Tests] Add comment to params for thresholding test --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index ecf4384d34..f8bc2df704 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -235,6 +235,7 @@ def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +# configuration (ch, pe) @pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 2), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado @@ -332,6 +333,7 @@ def read_weights(sim): @pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +# configuration (ch, pe) @pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 2), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado From ac56bae4a56e9cd9282bc555ccdadb7557a9ea5f Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 5 Mar 2024 16:36:47 +0000 Subject: [PATCH 184/291] [NBs] Update folding notebook --- notebooks/advanced/3_folding.ipynb | 50 +++++++++++++------------ notebooks/advanced/cybsec_PE_SIMD.onnx | Bin 192234 -> 192077 bytes 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb index 07b66da52f..8c7b97d6c6 100644 --- a/notebooks/advanced/3_folding.ipynb +++ b/notebooks/advanced/3_folding.ipynb @@ -8,7 +8,7 @@ "--------------------------------------\n", "**Note: We will utilize one of the intermediate models generated in the process of the cybersecurity end2end example**\n", "\n", - "There is a local copy of `step_convert_to_hls.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without requisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_convert_to_hls.onnx`. \n", + "There is a local copy of `step_specialize_layers.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without requisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_specialize_layers.onnx`. \n", "\n", "This notebook describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. \n", "\n", @@ -41,7 +41,7 @@ "source": [ "This notebook shows the manual version of this step and explains how these attributes can improve performance and what are their effects on resource utilization for developers who need to maximize the performance of their network. \n", "\n", - "For that we will use the `cybsec_PE_SIMD.onnx` file as starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HLS layers. Each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n", + "For that we will use the `cybsec_PE_SIMD.onnx` file as starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HW layers and then specialized to either HLS or RTL variants. In this example, all nodes were converted to HLS variants this means that each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n", "\n", "We will take this model to show how to set the folding factors manually and analyze the estimated execution clock cycles and the resource utilization of each layer in the network." ] @@ -56,7 +56,7 @@ "\n", "![](finn-dataflow.png)\n", "\n", - "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library.\n", + "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by RTL modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n", "\n", "Since each layer will be instantiated, we can flexibly set the parallelization of each layer and thus control resources and throughput of our network, as visualized in the image below:\n", "\n", @@ -72,11 +72,11 @@ "As discussed above, the network needs to go through a few preparation steps before it can be fed into our estimation functions.\n", "\n", "The `.onnx` file loaded here is taken from the cybersecurity end2end example notebook. \n", - "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means, network layers mapped to necessary FINN-HLS blocks. In this case, the `MatrixVectorActivation` units. \n", + "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means, network layers mapped to necessary FINN-HW blocks. In this case, the HLS variants of MatrixVectorActivation, `MVAU_hls` units. \n", "\n", "To interact with the `.onnx` file we use `ModelWrapper()`. This wrapper simplifies the access to different model attributes and allows us to apply custom transformations on the model.\n", "\n", - "In the below cell, we load our onnx file and view the cybersecurity MLP network in Netron." + "In the below cell, we load our onnx file and view the cybersecurity MLP network in Netron. Additionally, we call the transformation `GiveUniqueNodeNames` as a preparation." ] }, { @@ -87,8 +87,12 @@ "source": [ "import os\n", "from qonnx.core.modelwrapper import ModelWrapper\n", - "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\" \n", - "model = ModelWrapper(model_path)\n", + "from qonnx.transformation.general import GiveUniqueNodeNames\n", + "\n", + "model = ModelWrapper(os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\")\n", + "model = model.transform(GiveUniqueNodeNames())\n", + "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD_named_nodes.onnx\"\n", + "model.save(model_path)\n", "\n", "showInNetron(model_path)" ] @@ -106,7 +110,7 @@ "source": [ "The computational parallelism can be varied by setting the folding factors or also called parallelization parameters **PE** and **SIMD** of each layer. These parallelization attributes are subject to certain constraints and should be selected accordingly.\n", "\n", - "To see more details about how this is implemented in the `MatrixVectorActivation` layer (MVAU), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n", + "To see more details about how this is implemented in the HLS variant of the MatrixVectorActivation layer (`MVAU_hls`), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n", "\n", "![](finn-folding-mvau.png)" ] @@ -220,7 +224,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. So, this means in our example MVAU_0 uses 5 block ram and they are 83% utilized. " + "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. So, this means in our example MVAU_hls_0 uses 5 block ram and they are 83% utilized. " ] }, { @@ -262,7 +266,7 @@ "## Modify Parameters\n", "\n", "We now modify the parallelization parameters of the first network layer to reduce its latency.\n", - "We only extract the first `MatrixVectorActivation` block from the model and set the parallelization parameters manually.\n", + "We only extract the first `MVAU_hls` block from the model and set the parallelization parameters manually.\n", "\n", "In the first step, we left the `PE` & `SIMD` values for all the layers on default (=1) to establish a baseline and measure the estimated clock cycles and resource utilization for each of the individual layers.\n", "\n", @@ -277,7 +281,7 @@ "source": [ "from qonnx.custom_op.registry import getCustomOp\n", "\n", - "list_of_mvaus = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "list_of_mvaus = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "mvau0 = list_of_mvaus[0]\n", "\n", "mvau0_inst = getCustomOp(mvau0)\n", @@ -301,7 +305,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We save the model and view it. On expanding the first `MatrixVectorActivation` we can see the updated `PE` & `SIMD` parameters for that layer." + "We save the model and view it. On expanding the first `MVAU_hls` we can see the updated `PE` & `SIMD` parameters for that layer." ] }, { @@ -418,7 +422,7 @@ "outputs": [], "source": [ "dir_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/\" \n", - "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD.onnx\")\n", + "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD_named_nodes.onnx\")\n", "model_updated = ModelWrapper(\"cybsec_PE_SIMD_modified.onnx\")" ] }, @@ -436,7 +440,7 @@ "outputs": [], "source": [ "# Original model\n", - "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n", "print(\"In the original model (pe=simd=1): \")\n", "for mvau in list_of_mvaus:\n", " mvau_inst = getCustomOp(mvau)\n", @@ -452,7 +456,7 @@ "outputs": [], "source": [ "# Updated model\n", - "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n", "print(\"In the original model (pe=simd=1): \")\n", "for mvau in list_of_mvaus:\n", " mvau_inst = getCustomOp(mvau)\n", @@ -465,7 +469,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the input and output shape for MatrixVectorActivation_0 has changed after we have changed the folding factors. These changes have direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU." + "We can see that the input and output shape for MVAU_hls_0 has changed after we have changed the folding factors. These changes have direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU." ] }, { @@ -507,7 +511,7 @@ "outputs": [], "source": [ "# Original model\n", - "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n", "print(\"In the original model (pe=simd=1): \")\n", "for mvau in list_of_mvaus:\n", " mvau_inst = getCustomOp(mvau)\n", @@ -537,7 +541,7 @@ "outputs": [], "source": [ "# Updated model\n", - "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n", "print(\"In the original model (pe=simd=1): \")\n", "for mvau in list_of_mvaus:\n", " mvau_inst = getCustomOp(mvau)\n", @@ -550,7 +554,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As we can see, the output stream width of MatrixVectorActivation_0 has now changed to `4`, while the input stream width of MatrixVectorActivation_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes, we can manually invoke this behavior by calling the transformation `InsertDWC` on our model." + "As we can see, the output stream width of MVAU_hls_0 has now changed to `4`, while the input stream width of MatrixVectorActivation_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes, we can manually invoke this behavior by first calling the transformation `InsertDWC` and then converting the resulting DWCs into an HLS or RTL variant by calling `SpecializeLayers`." ] }, { @@ -560,9 +564,10 @@ "outputs": [], "source": [ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", - "from qonnx.transformation.general import GiveUniqueNodeNames\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "\n", "model_updated = model_updated.transform(InsertDWC())\n", + "model_updated = model_updated.transform(SpecializeLayers())\n", "model_updated = model_updated.transform(GiveUniqueNodeNames())" ] }, @@ -610,7 +615,6 @@ "source": [ "layers = res_dict_dwc.keys()\n", "# replace names of layers with abbreviations\n", - "layers = [n.replace(\"MatrixVectorActivation_\", \"MVU\") for n in layers]\n", "layers = [n.replace(\"StreamingDataWidthConverter_Batch\", \"DWC\") for n in layers]" ] }, @@ -656,9 +660,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx index b450cc9e43361e845fda8c95d743e1b461a1a9ad..d09d07d2bf1b502d93bc676c8901fdc29de51d6b 100644 GIT binary patch delta 241 zcmaELm;3A;Zb=SyA@+jGlKi6N3@J84JwszXL#w%WxVf4qiurOWarlNghQ?>)6i@!g zEIo0RM42vNWeQNTQX;%g@H$=%G3*p)%dX`Zac z>W8ENXqDLHJ*?j#M)84+65<3qFg`1@`953weKyAJ_t}`3pHA<3!{jpk(QBp^(|5dO ma-Dwe4b!XXf8R1)5aDwYcVni6C$Bf4i zBO(l8;abkjHMx*kZ1PnWGu-Ym#$^=HTVj*7S-(M?Bq$^R4lUiR%o5$yih}&2W^cB3 zZ#Ks5-fT?FPp40M&9q{=-)kn9>Gf}zUQM@p!{j30Y(5_RgC8V From 4a3eedade41a061aa6d7c97795ffb2ce70535c0b Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 6 Mar 2024 10:33:25 +0000 Subject: [PATCH 185/291] [Thresholding RTL] Add doc strings to class methods --- .../fpgadataflow/rtl/thresholding_rtl.py | 64 ++++++------------- 1 file changed, 18 insertions(+), 46 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index f9acece073..ee101b1cc8 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -80,6 +80,20 @@ def get_nodeattr_types(self): return my_attrs def get_pe_mem_geometries(self): + ''' return a list of (bitwidth, depth) for PE memory configurations to be used in resource estimation + + for each bitwidth, the depth is calculated as the + number of thresholds that can be stored in a single + memory block + the bitwidth is the bitwidth of the threshold values + the depth is the number of thresholds that can be stored + in a single memory block + the number of memory blocks is calculated as the number + of thresholds divided by the depth + the number of memory blocks is then multiplied by the + number of PEs to get the total number of memory blocks + required for the entire layer + ''' pe = self.get_nodeattr("PE") wdt = self.get_weight_datatype() wdt_bits = wdt.bitwidth() @@ -95,6 +109,7 @@ def get_pe_mem_geometries(self): return ret def get_memory_estimate(self): + ''' return the memory estimate for this node ''' res_dict = {} depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") @@ -116,14 +131,17 @@ def get_memory_estimate(self): return res_dict def bram_estimation(self): + ''' return the number of BRAMs required for this node ''' res_dict = self.get_memory_estimate() return res_dict.get("BRAM", 0) def uram_estimation(self): + ''' return the number of URAMs required for this node ''' res_dict = self.get_memory_estimate() return res_dict.get("URAM", 0) def lut_estimation(self): + ''' return the number of LUTs required for this node ''' res_dict = self.get_memory_estimate() return res_dict.get("LUTRAM", 0) @@ -468,52 +486,6 @@ def get_verilog_top_module_intf_names(self): return intf_names - def get_dynamic_config(self, weights, address_stride=1): - """Returns a configuration dictionary containing axilite write commands - in order to program the thresholds into the RTL core during runtime. - The default address stride for the weights is 1 byte.""" - - # thresholds = model.get_initializer(self.onnx_node.input[1]) - thresholds = weights - num_channels, num_weights_per_channel = thresholds.shape - - weight_addr_boundary = find_next_power_of_2(num_weights_per_channel) - # Make sure that the next power of 2 (output) is greater than the input - assert weight_addr_boundary >= num_weights_per_channel - - config = {} - channel_cntr = 0 - wdt = self.get_weight_datatype() - bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) - for channel in thresholds: - channel_start_addr = channel_cntr * weight_addr_boundary * address_stride - weight_cntr = 0 - addr = 0 - for weight in channel: - key_name = "{}_{}{}_{}{}".format( - "axilite", "ch", str(channel_cntr), "w", str(weight_cntr) - ) - config[key_name] = ( - channel_start_addr + addr, - int( - str( - pack_innermost_dim_as_hex_string( - [weight], - wdt, - bw_hexdigit, - ) - ), - 0, - ), - ) - - weight_cntr += 1 - addr += address_stride - - channel_cntr += 1 - - return config - def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or From f759400095adff49b2b715d0490e3cb657436852 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 6 Mar 2024 10:57:38 +0000 Subject: [PATCH 186/291] [tests] functional validation thresholding to_hw transform Signed-off-by: aziz bahri --- .../test_convert_to_hw_thresholding.py | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index ee161a9b95..9d44702152 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -36,9 +36,11 @@ from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes - +from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.custom_op.general.multithreshold import multithreshold from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +import finn.core.onnx_exec as oxe test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -48,6 +50,18 @@ def sort_thresholds_increasing(thresholds): return np.sort(thresholds, axis=1) +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC(FINN) to NCHW(Standard) +def layout_FINN2NCHW(data): + return np.transpose(data, (0, 3, 1, 2)) + +# Convert from NCHW(Standard) to NHWC(FINN) +def layout_NCHW2FINN(data): + return np.transpose(data, (0, 2, 3, 1)) def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): return np.random.randint( @@ -164,6 +178,27 @@ def test_convert_multithreshold_to_hardware( model = model.transform(InferThresholdingLayer()) + # Perform functional validation of the InferThresholdingLayer transform + x = gen_finn_dt_tensor(input_data_type, tuple(num_input_vecs + [num_input_channels])) + + x_nchw = layout_FINN2NCHW(x) + y_expected = multithreshold(x_nchw, thresholds) + + # convert back to NHWC for comparison to hw outputs + y_expected = layout_NCHW2FINN(y_expected) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 + else: + # signed offset + y_expected += activation.min() + + input_dict = prepare_inputs(x) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + assert (y_produced == y_expected).all() + + # Transform to the specified implementation style, either the RTL or HLS according to test parameters node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", impl_style) From 06607d52740716299fecb8583a83bf7ecf0b62c0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 6 Mar 2024 13:50:46 +0000 Subject: [PATCH 187/291] [mem mode] Refactor mem_mode argument --- src/finn/builder/build_dataflow_config.py | 12 ---- src/finn/builder/build_dataflow_steps.py | 5 +- .../custom_op/fpgadataflow/hls/lookup_hls.py | 16 ++--- .../hls/matrixvectoractivation_hls.py | 42 ++++++------- .../fpgadataflow/hls/thresholding_hls.py | 59 ++++++++++--------- .../hls/vectorvectoractivation_hls.py | 34 +++++------ src/finn/custom_op/fpgadataflow/lookup.py | 10 ++-- .../fpgadataflow/matrixvectoractivation.py | 55 +++++++++-------- .../fpgadataflow/vectorvectoractivation.py | 53 +++++++++-------- .../fpgadataflow/convert_to_hw_layers.py | 14 +---- .../fpgadataflow/make_zynq_proj.py | 2 +- .../fpgadataflow/set_fifo_depths.py | 5 +- tests/end2end/test_end2end_bnn_pynq.py | 4 +- tests/end2end/test_end2end_mobilenet_v1.py | 2 +- .../test_convert_to_hw_layers_cnv.py | 2 +- tests/fpgadataflow/test_fifosizing.py | 1 - .../test_fpgadataflow_checksum.py | 13 ++-- .../test_fpgadataflow_ipstitch.py | 14 ++--- tests/fpgadataflow/test_fpgadataflow_mvau.py | 16 ++--- .../test_fpgadataflow_thresholding.py | 14 +++-- tests/fpgadataflow/test_fpgadataflow_vvau.py | 4 +- tests/fpgadataflow/test_runtime_weights.py | 2 +- tests/fpgadataflow/test_split_large_fifos.py | 1 - 23 files changed, 188 insertions(+), 192 deletions(-) diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 4cbcfb21c3..e35c1cd346 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -65,15 +65,6 @@ class DataflowOutputType(str, Enum): DEPLOYMENT_PACKAGE = "deployment_package" -class ComputeEngineMemMode(str, Enum): - """Memory mode for generated compute engines. See - https://finn.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode - for more information.""" - - CONST = "const" - DECOUPLED = "decoupled" - - class VitisOptStrategyCfg(str, Enum): """Vitis optimization strategy with serializable string enum values.""" @@ -293,9 +284,6 @@ class DataflowBuildConfig: #: If not specified it will default to synth_clk_period_ns hls_clk_period_ns: Optional[float] = None - #: Which memory mode will be used for compute layers - default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED - #: Call CapConvolutionFIFODepths in InsertAndSetFIFODepths transform #: to make convolution FIFOs smaller where appropriate default_swg_exception: Optional[bool] = False diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index a75bbe98a1..f935d5c53e 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -337,14 +337,13 @@ def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): In the end am empty json file is created which can be used to set user specific preferred implementation styles for each node.""" - mem_mode = cfg.default_mem_mode.value if cfg.standalone_thresholds: # doing this first causes all threshold layers to be standalone model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hw.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) # needed for non-bipolar MatMul layers - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) as standalone threshold diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py index feeca8719b..ba44deb898 100644 --- a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -57,7 +57,7 @@ def global_includes(self): mem_mode = self.get_nodeattr("mem_mode") global_incls = [] global_incls.append('#include "lookup.hpp"') - if mem_mode == "const": + if mem_mode == "internal_embedded": global_incls.append('#include "embeddings.hpp"') self.code_gen_dict["$GLOBALS$"] = global_incls @@ -80,7 +80,7 @@ def defines(self, var): my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) my_defines.append("#define T_SRC %s" % elem_hls_type) my_defines.append("#define T_DST ap_uint") - elif mem_mode == "const": + elif mem_mode == "internal_embedded": my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")) my_defines.append("#define EmbeddingDim %d" % emb_dim) my_defines.append("#define InputType %s" % elem_hls_type) @@ -143,7 +143,7 @@ def dataoutstrm(self): def docompute(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$DOCOMPUTE$"] = [ """StreamingLookup(in0_%s, out_%s, embeddings);""" @@ -162,7 +162,7 @@ def blackboxfunction(self): packed_input_hls_type = "ap_uint<%d>" % ibits obits = self.get_outstream_width() packed_output_hls_type = "ap_uint<%d>" % obits - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" % ( @@ -188,7 +188,7 @@ def pragmas(self): my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()] my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname()) my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if mem_mode == "const": + if mem_mode == "internal_embedded": my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM") elif mem_mode == "external": my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") @@ -203,7 +203,7 @@ def pragmas(self): def generate_params(self, model, path): mem_mode = self.get_nodeattr("mem_mode") embeddings = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": + if mem_mode == "internal_embedded": code_gen_dir = path weight_filename = "{}/embeddings.hpp".format(code_gen_dir) edt = DataType[self.get_nodeattr("EmbeddingType")] @@ -257,8 +257,8 @@ def execute_node(self, context, graph): folded_oshape = tuple(self.get_folded_output_shape()) mem_mode = self.get_nodeattr("mem_mode") assert ( - mem_mode == "const" - ), "Only mem_mode=const is supported for simulation of Lookup layer" + mem_mode == "internal_embedded" + ), "Only mem_mode=internal_embedded is supported for simulation of Lookup layer" if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index e279d3953a..8c640f6534 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -78,8 +78,8 @@ def lut_estimation(self): c2 = 0 mmode = self.get_nodeattr("mem_mode") mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 + if (mmode == "internal_decoupled" and mstyle == "distributed") or ( + mmode == "internal_embedded" and self.calc_wmem() <= 128 ): c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) @@ -178,7 +178,7 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() if mem_mode == "external": intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: @@ -190,9 +190,9 @@ def global_includes(self): self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: + if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] @@ -228,7 +228,7 @@ def defines(self, var): numReps, ) ] - if mem_mode == "decoupled" or mem_mode == "external": + if mem_mode == "internal_decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) @@ -259,7 +259,7 @@ def read_npy_data(self): ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": + if mem_mode == "internal_decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() elem_bits = wdt.bitwidth() packed_bits = self.get_weightstream_width() @@ -294,7 +294,7 @@ def strm_decl(self): ) ) - if mem_mode == "decoupled" or mem_mode == "external": + if mem_mode == "internal_decoupled" or mem_mode == "external": self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> weights_{} ("weights_{}");'.format( self.get_weightstream_width(), self.hls_sname(), self.hls_sname() @@ -314,7 +314,7 @@ def docompute(self): threshs = "PassThroughActivation<%s>()" % odtype_hls_str else: threshs = "threshs" - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$DOCOMPUTE$"] = [ """Matrix_Vector_Activate_Batch (in0_{}, out_{}, weights, {}, numReps, {});""".format( @@ -327,7 +327,7 @@ def docompute(self): map_to_hls_mult_style[self.get_nodeattr("resType")], ) ] - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() if wdt == DataType["BIPOLAR"]: export_wdt = DataType["BINARY"] @@ -351,7 +351,7 @@ def docompute(self): else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -389,7 +389,7 @@ def save_as_npy(self): def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &out_{} @@ -401,7 +401,7 @@ def blackboxfunction(self): self.hls_sname(), ) ] - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}( hls::stream> &in0_{}, @@ -420,8 +420,8 @@ def blackboxfunction(self): else: raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" + """Please set mem_mode to "internal_embedded" or "internal_decoupled", + currently no other parameter value is supported!""" ) def pragmas(self): @@ -435,21 +435,21 @@ def pragmas(self): ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') # the weight tensor is ap_uint [PE][WMEM] # partition for parallel access along the PE dimension (dim 1) self.code_gen_dict["$PRAGMAS$"].append( ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") ) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() ) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or external, + """Please set mem_mode to "internal_embedded", "internal_decoupled", or external, currently no other parameter value is supported!""" ) @@ -482,7 +482,7 @@ def pragmas(self): def get_ap_int_max_w(self): # base class impl (max of inp/out stream widths) max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream + # internal_decoupled mode weight stream weightstream = self.get_weightstream_width() # single PE weight entry weight_bits = self.get_weight_datatype().bitwidth() @@ -556,7 +556,7 @@ def execute_node(self, context, graph): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) self.reset_rtlsim(sim) self.toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": + if mem_mode == "external" or mem_mode == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, @@ -597,7 +597,7 @@ def instantiate_ip(self, cmd): # instantiate the HLS IP vlnv = self.get_nodeattr("ip_vlnv") node_name = self.onnx_node.name - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) else: cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index fb90365eef..5fb1843270 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -63,10 +63,15 @@ def get_nodeattr_types(self): # string defining memory type "ram_style": ("s", False, "distributed", {"distributed", "block"}), # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", "decoupled"}), - # (mem_mode = decoupled only) whether weights (thresholds) will be + # internal_embedded -- embedded thresholds + # internal_decoupled -- default, streaming thresholds with streamer packaged inside IP + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled"}, + ), + # (mem_mode = internal_decoupled only) whether weights (thresholds) will be # writable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory @@ -119,8 +124,8 @@ def lut_estimation(self): return comparator_cost + lutram_cost def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if self.get_nodeattr("mem_mode") == "decoupled": + """Returns weight stream width. Used only in internal_decoupled mode.""" + if self.get_nodeattr("mem_mode") == "internal_decoupled": pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() n_thres_steps = self.get_nodeattr("numSteps") @@ -131,7 +136,7 @@ def get_weightstream_width(self): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" + by the AXI Stream spec. Used in internal_decoupled mode.""" weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) @@ -304,12 +309,12 @@ def generate_params(self, model, path): code_gen_dir = path thresholds = model.get_initializer(self.onnx_node.input[1]) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": # save thresholds in thresh.h weight_filename = "{}/thresh.h".format(code_gen_dir) self.make_weight_file(thresholds, "hls_header", weight_filename) - elif mem_mode == "decoupled": - # save decoupled weights for cppsim + elif mem_mode == "internal_decoupled": + # save internal_decoupled weights for cppsim weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) # also save weights as Verilog .dat file @@ -383,7 +388,7 @@ def execute_node(self, context, graph): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( @@ -396,7 +401,7 @@ def execute_node(self, context, graph): } self.rtlsim_multi_io(sim, io_dict) output = io_dict["outputs"]["out"] - elif self.get_nodeattr("mem_mode") == "const": + elif self.get_nodeattr("mem_mode") == "internal_embedded": output = self.rtlsim(sim, inp) else: raise Exception("Unrecognized mem_mode") @@ -422,7 +427,7 @@ def execute_node(self, context, graph): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") == "internal_embedded": self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] # TODO check and add whatever missing @@ -440,7 +445,7 @@ def defines(self, var): total_spatial_size, ) ] - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": self.code_gen_dict["$DEFINES$"].append( "#define ActVal1 %d" % self.get_nodeattr("ActVal") ) @@ -474,7 +479,7 @@ def read_npy_data(self): ) ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": tdt = self.get_weight_datatype() elem_bits = tdt.bitwidth() packed_bits = self.get_weightstream_width() @@ -508,7 +513,7 @@ def strm_decl(self): ) ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> weights_{} ("weights_{}");'.format( self.get_weightstream_width(), self.hls_sname(), self.hls_sname() @@ -518,7 +523,7 @@ def strm_decl(self): def docompute(self): tmpl_args = self.get_template_param_values() mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$DOCOMPUTE$"] = [ """Thresholding_Batch (in0_{}, out_{}, threshs, numReps);""".format( @@ -528,7 +533,7 @@ def docompute(self): self.hls_sname(), ) ] - elif mem_mode == "decoupled": + elif mem_mode == "internal_decoupled": # note that numReps is set to 1 in the invocation below, since # - for cppsim the repetition comes from the threshold stream reader+input # - for synth the unit runs continuously anyway (ap_ctrl_none) @@ -576,7 +581,7 @@ def dataoutstrm(self): ] def blackboxfunction(self): - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") == "internal_embedded": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &out_{} @@ -588,7 +593,7 @@ def blackboxfunction(self): self.hls_sname(), ) ] - elif self.get_nodeattr("mem_mode") == "decoupled": + elif self.get_nodeattr("mem_mode") == "internal_decoupled": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &weights_{}, @@ -615,7 +620,7 @@ def pragmas(self): ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") == "internal_embedded": # the threshold tensor is acc_type [PE][TMEM][N_THRES] # partition for parallel access along PE and N_THRES # dimensions (dims 1 and 3) @@ -647,7 +652,7 @@ def pragmas(self): ram_style ) ) - elif self.get_nodeattr("mem_mode") == "decoupled": + elif self.get_nodeattr("mem_mode") == "internal_decoupled": self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() ) @@ -656,7 +661,7 @@ def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": node_name = self.onnx_node.name runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 sname = self.hls_sname() @@ -749,8 +754,8 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const": - # base class impl sufficient for const mode + elif mem_mode == "internal_embedded": + # base class impl sufficient for internal_embedded mode return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for Thresholding_Batch") @@ -759,7 +764,7 @@ def code_generation_ipi(self): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: @@ -791,7 +796,7 @@ def derive_characteristic_fxns(self, period): "outputs": {"out": []}, } mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: + if mem_mode in ["internal_decoupled", "external"]: n_weight_inps = self.calc_tmem() num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index 7e475ff67f..c7f0576495 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -114,7 +114,7 @@ def execute_node(self, context, graph): super().reset_rtlsim(sim) super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": + if mem_mode == "external" or mem_mode == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, @@ -196,9 +196,9 @@ def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: + if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) if self.calc_tmem() != 0: @@ -221,7 +221,7 @@ def defines(self, var): numReps, ) ] - if mem_mode == "decoupled" or mem_mode == "external": + if mem_mode == "internal_decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) @@ -252,7 +252,7 @@ def read_npy_data(self): ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": + if mem_mode == "internal_decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() elem_bits = wdt.bitwidth() packed_bits = self.get_weightstream_width() @@ -286,7 +286,7 @@ def strm_decl(self): self.get_outstream_width(), self.hls_sname(), self.hls_sname() ) ) - if mem_mode == "decoupled" or mem_mode == "external": + if mem_mode == "internal_decoupled" or mem_mode == "external": self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> weights_{} ("weights_{}");'.format( self.get_weightstream_width(), self.hls_sname(), self.hls_sname() @@ -307,7 +307,7 @@ def docompute(self): else: threshs = "threshs" - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$DOCOMPUTE$"] = [ """Vector_Vector_Activate_Batch (in0_{}, out_{}, weights, {}, numReps, {});""".format( @@ -320,7 +320,7 @@ def docompute(self): map_to_hls_mult_style[self.get_nodeattr("resType")], ) ] - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": wdt = self.get_weight_datatype() if wdt == DataType["BIPOLAR"]: export_wdt = DataType["BINARY"] @@ -344,7 +344,7 @@ def docompute(self): ] else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -382,7 +382,7 @@ def save_as_npy(self): def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}(hls::stream> &in0_{}, hls::stream> &out_{} @@ -394,7 +394,7 @@ def blackboxfunction(self): self.hls_sname(), ) ] - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ """void {}( hls::stream> &in0_{}, @@ -412,8 +412,8 @@ def blackboxfunction(self): ] else: raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" + """Please set mem_mode to "internal_embedded" or "internal_decoupled", + currently no other parameter value is supported!""" ) def pragmas(self): @@ -426,20 +426,20 @@ def pragmas(self): ) self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') # the weight tensor is ap_uint [PE][WMEM] # partition for parallel access along the PE dimension (dim 1) self.code_gen_dict["$PRAGMAS$"].append( ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") ) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() ) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or external, + """Please set mem_mode to "internal_embedded", "internal_decoupled", or external, currently no other parameter value is supported!""" ) @@ -458,7 +458,7 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() if mem_mode == "external": intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index 367bda1f07..ab6228a5d6 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -57,9 +57,9 @@ def get_nodeattr_types(self): # Input shape "InputShape": ("ints", False, [1]), # Memory mode - # const : parameters baked into bitfile (BRAM) + # internal_embedded : parameters baked into bitfile (BRAM) # external : lookup performed in external memory over AXI MM - "mem_mode": ("s", False, "const", ["const", "external"]), + "mem_mode": ("s", False, "internal_embedded", ["internal_embedded", "external"]), # Width for AXI-MM interface # only relevant when mem_mode="external" "ext_mem_width": ("i", False, 32), @@ -90,7 +90,7 @@ def get_folded_output_shape(self, ind=0): ishape = self.get_normal_input_shape() mem_mode = self.get_nodeattr("mem_mode") emb_dim = self.get_nodeattr("EmbeddingDim") - if mem_mode == "const": + if mem_mode == "internal_embedded": oshape = list(ishape) + [emb_dim] elif mem_mode == "external": ext_mem_width = self.get_nodeattr("ext_mem_width") @@ -187,9 +187,9 @@ def execute_node(self, context, graph): def bram_estimation(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": # current calculation assumes embeddings always stored in BRAM_18Ks - # when mem_mode is const + # when mem_mode is internal_embedded width_factor = ceil(self.get_outstream_width() / 16) depth_factor = ceil(self.get_nodeattr("NumEmbeddings") / 1024) return width_factor * depth_factor diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index ac173e4af6..a9f62077bd 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -82,11 +82,16 @@ def get_nodeattr_types(self): # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), # memory mode for the FC weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP + # internal_embedded -- embedded weights, long compile/synth times + # internal_decoupled -- default, streaming weights with streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # FPGA resource type for memories in decoupled mode + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled", "external"}, + ), + # FPGA resource type for memories in internal_decoupled mode # auto -- let Vivado decide # block -- use BRAM # distributed -- use LUTRAM @@ -108,8 +113,8 @@ def get_nodeattr_types(self): "auto", {"auto", "block", "distributed"}, ), - # (mem_mode = decoupled only) whether weights will be writable through - # an AXI-lite interface during runtime + # (mem_mode = internal_decoupled only) whether weights will be + # writeable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory # address map used for writable weights @@ -265,9 +270,10 @@ def get_outstream_width(self, ind=0): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" + """Returns weight stream width. + Used only in internal_decoupled and external mode.""" if ( - self.get_nodeattr("mem_mode") == "decoupled" + self.get_nodeattr("mem_mode") == "internal_decoupled" or self.get_nodeattr("mem_mode") == "external" ): pe = self.get_nodeattr("PE") @@ -280,7 +286,7 @@ def get_weightstream_width(self): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" + by the AXI Stream spec. Used in internal_decoupled mode.""" weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) @@ -360,8 +366,8 @@ def uram_estimation(self): mmode = self.get_nodeattr("mem_mode") mstyle = self.get_nodeattr("ram_style") if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) + (mmode == "internal_decoupled" and mstyle != "ultra") + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -389,13 +395,14 @@ def bram_estimation(self): mmode = self.get_nodeattr("mem_mode") mstyle = self.get_nodeattr("ram_style") if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) + (mmode == "internal_decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory, which is more efficient than const (HLS) + # assuming internal_decoupled (RTL) memory, + # which is more efficient than internal_embedded (HLS) if mem_width == 1: return math.ceil(omega / 16384) elif mem_width == 2: @@ -674,7 +681,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): f_weights.write(weight_hls_code) f_weights.close() elif "decoupled" in weight_file_mode: - # create a weight stream for various flavors of decoupled mode: + # create a weight stream for various flavors of internal_decoupled mode: # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) # reverse SIMD flip for saving weights in .npy @@ -739,22 +746,22 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": + if mem_mode == "internal_embedded": # save hlslib-compatible weights in params.h weight_filename = "{}/params.h".format(code_gen_dir) self.make_weight_file(weights, "hls_header", weight_filename) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": weight_filename_sim = "{}/weights.npy".format(code_gen_dir) - # save decoupled weights for cppsim + # save internal_decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # also save weights as Verilog .dat file # This file will be ignored when synthesizing UltraScale memory. weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -840,7 +847,7 @@ def derive_characteristic_fxns(self, period): "outputs": {"out": []}, } mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: + if mem_mode in ["internal_decoupled", "external"]: n_weight_inps = self.calc_wmem() num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] @@ -850,7 +857,7 @@ def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if self.get_nodeattr("ram_style") == "ultra": assert ( @@ -945,8 +952,8 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes + elif mem_mode == "internal_embedded" or mem_mode == "external": + # base class impl sufficient for internal_embedded/external modes self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 79265f8daa..c5ec7e0648 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -67,11 +67,16 @@ def get_nodeattr_types(self): # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), # memory mode for the layer weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP + # internal_embedded -- embedded weights, long compile/synth times + # internal_decoupled -- default, streaming weights with streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # (mem_mode = decoupled only) whether weights will be writable through + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled", "external"}, + ), + # (mem_mode = internal_decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory @@ -81,7 +86,7 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # FPGA resource type for memories in decoupled mode + # FPGA resource type for memories in internal_decoupled mode # auto -- let Vivado decide # block -- use BRAM # distributed -- use LUTRAM @@ -200,9 +205,9 @@ def get_instream_width(self, ind=0): return in_width def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" + """Returns weight stream width. Used only in internal_decoupled mode.""" if ( - self.get_nodeattr("mem_mode") == "decoupled" + self.get_nodeattr("mem_mode") == "internal_decoupled" or self.get_nodeattr("mem_mode") == "external" ): simd = self.get_nodeattr("SIMD") @@ -220,7 +225,7 @@ def get_outstream_width(self, ind=0): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" + by the AXI Stream spec. Used in internal_decoupled mode.""" weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) @@ -300,8 +305,8 @@ def uram_estimation(self): mmode = self.get_nodeattr("mem_mode") mstyle = self.get_nodeattr("ram_style") if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const") + (mmode == "internal_decoupled" and mstyle != "ultra") + or (mmode == "internal_embedded") or (mmode == "external") ): return 0 @@ -324,9 +329,9 @@ def bram_estimation(self): mmode = self.get_nodeattr("mem_mode") mstyle = self.get_nodeattr("ram_style") if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) + (mmode == "internal_decoupled" and mstyle in ["distributed", "ultra"]) or (mstyle == "auto" and self.calc_wmem() <= 128) - or (mmode == "const" and self.calc_wmem() <= 128) + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) or (mmode == "external") ): return 0 @@ -392,8 +397,8 @@ def lut_estimation(self): c2 = 0 mmode = self.get_nodeattr("mem_mode") mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 + if (mmode == "internal_decoupled" and mstyle == "distributed") or ( + mmode == "internal_embedded" and self.calc_wmem() <= 128 ): c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) @@ -679,7 +684,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): f_weights.write(weight_hls_code) f_weights.close() elif "decoupled" in weight_file_mode: - # create a weight stream for various flavors of decoupled mode: + # create a weight stream for various flavors of internal_decoupled mode: # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) # reverse SIMD flip for saving weights in .npy @@ -744,22 +749,22 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": + if mem_mode == "internal_embedded": # save hlslib-compatible weights in params.h weight_filename = "{}/params.h".format(code_gen_dir) self.make_weight_file(weights, "hls_header", weight_filename) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": weight_filename_sim = "{}/weights.npy".format(code_gen_dir) - # save decoupled weights for cppsim + # save internal_decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # also save weights as Verilog .dat file # This file will be ignored when synthesizing UltraScale memory. weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -845,7 +850,7 @@ def derive_characteristic_fxns(self, period): "outputs": {"out": []}, } mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: + if mem_mode in ["internal_decoupled", "external"]: n_weight_inps = self.calc_wmem() num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] @@ -855,7 +860,7 @@ def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if self.get_nodeattr("ram_style") == "ultra": assert ( @@ -952,8 +957,8 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes + elif mem_mode == "internal_embedded" or mem_mode == "external": + # base class impl sufficient for internal_embedded/external modes return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 014a5c82bd..fdb892e911 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1287,9 +1287,8 @@ class InferBinaryMatrixVectorActivation(Transformation): MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.""" - def __init__(self, mem_mode="const"): + def __init__(self): super().__init__() - self.mem_mode = mem_mode def apply(self, model): graph = model.graph @@ -1372,7 +1371,6 @@ def apply(self, model): binaryXnorMode=1, noActivation=0, numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, name=n.name, ) graph.node.insert(node_ind, new_node) @@ -1403,7 +1401,6 @@ def apply(self, model): binaryXnorMode=1, noActivation=1, numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, name=n.name, ) graph.node.insert(node_ind, new_node) @@ -1420,9 +1417,8 @@ class InferQuantizedMatrixVectorActivation(Transformation): """Convert MatMul layers with quantized inputs and weights to MatrixVectorActivation layers.""" - def __init__(self, mem_mode="const"): + def __init__(self): super().__init__() - self.mem_mode = mem_mode def apply(self, model): graph = model.graph @@ -1509,7 +1505,6 @@ def apply(self, model): binaryXnorMode=0, noActivation=0, numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, name="MVAU_" + n.name, ) graph.node.insert(node_ind, new_node) @@ -1540,7 +1535,6 @@ def apply(self, model): binaryXnorMode=0, noActivation=1, numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, name="MVAU_" + n.name, ) graph.node.insert(node_ind, new_node) @@ -1560,9 +1554,8 @@ class InferVectorVectorActivation(Transformation): a depthwise convolution. Any immediately following MultiThreshold layers will also be absorbed into the VVAU.""" - def __init__(self, mem_mode="const"): + def __init__(self): super().__init__() - self.mem_mode = mem_mode def apply(self, model): graph = model.graph @@ -1659,7 +1652,6 @@ def apply(self, model): ActVal=actval, noActivation=0, name="VectorVectorActivation_" + n.name, - mem_mode=self.mem_mode, ) graph.node.insert(node_ind, new_node) # remove old nodes diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index 7e3754e41e..ade38ddfbf 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -65,7 +65,7 @@ def collect_ip_dirs(model, ipstitch_path): contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] if node.op_type.startswith("MVAU") or node.op_type.startswith("Thresholding"): - if node_inst.get_nodeattr("mem_mode") == "decoupled": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] if need_memstreamer: diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index d81f1fe247..e150e7a10b 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -290,10 +290,11 @@ def apply(self, model): mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) - node.set_nodeattr("mem_mode", "decoupled") + node.set_nodeattr("mem_mode", "internal_decoupled") reset_implementation(node) warnings.warn( - "Changed mem_mode from external to decoupled for " + node.onnx_node.name + "Changed mem_mode from external to internal_decoupled for " + + node.onnx_node.name ) # insert stream infrastructure (DWC/FIFO) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index bdede35244..0fab1b298e 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -108,7 +108,7 @@ build_dir = os.environ["FINN_BUILD_DIR"] target_clk_ns = 20 -mem_mode = "decoupled" +mem_mode = "internal_decoupled" rtlsim_trace = False @@ -134,7 +134,7 @@ def fold_tfc(model): inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) - inp_qnt.set_nodeattr("mem_mode", "decoupled") + inp_qnt.set_nodeattr("mem_mode", "internal_decoupled") inp_qnt.set_nodeattr("runtime_writeable_weights", 1) return model diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 1fceda8141..abd019c7bc 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -92,7 +92,7 @@ test_platform = alveo_default_platform[test_board] test_fpga_part = alveo_part_map[test_board] target_clk_ns = 3 -mem_mode = "decoupled" +mem_mode = "internal_decoupled" large_fifo_ram_style = "ultra" extra_fold = 1 first_layer_res_type = "dsp" diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index 64ccebf97a..96e945d083 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -113,7 +113,7 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): for node in model.graph.node: if node.op_type == "MVAU_hls": inst = getCustomOp(node) - inst.set_nodeattr("mem_mode", "decoupled") + inst.set_nodeattr("mem_mode", "internal_decoupled") mw = inst.get_nodeattr("MW") mh = inst.get_nodeattr("MH") if mh % 4 == 0: diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index f3716dea9b..338204c0c7 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -76,7 +76,6 @@ def test_fifosizing_linear(method, topology): build_cfg.DataflowOutputType.STITCHED_IP, build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, ], - default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, ) build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 81a4e3e33c..34a48996c9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -72,10 +72,10 @@ def create_two_fc_model(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MVAU", + "MVAU_hls", ["inp", "w0"], ["mid"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -87,14 +87,14 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode="internal_decoupled", ) fc1 = helper.make_node( - "MVAU", + "MVAU_hls", ["mid", "w1"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -106,7 +106,7 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode="internal_decoupled", ) graph = helper.make_graph( @@ -141,7 +141,6 @@ def test_fpgadataflow_checksum(): # use a graph consisting of two fc layers to test # checksum node insertion model = create_two_fc_model() - model = model.transform(SpecializeLayers()) # set checksum output hook for n in model.graph.node: diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index ab62b2d476..2061601b4a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -62,7 +62,7 @@ ip_stitch_model_dir = os.environ["FINN_BUILD_DIR"] -def create_one_fc_model(mem_mode="const"): +def create_one_fc_model(mem_mode="internal_embedded"): # create a model with a MatrixVectorActivation instance with no activation # the wider range of the full accumulator makes debugging a bit easier wdt = DataType["INT2"] @@ -114,7 +114,7 @@ def create_one_fc_model(mem_mode="const"): return model -def create_two_fc_model(mem_mode="decoupled"): +def create_two_fc_model(mem_mode="internal_decoupled"): # create a model with two MatrixVectorActivation instances wdt = DataType["INT2"] idt = DataType["INT32"] @@ -195,7 +195,7 @@ def create_two_fc_model(mem_mode="decoupled"): return model -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_gen_model(mem_mode): @@ -214,7 +214,7 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_do_stitch(mem_mode): @@ -232,7 +232,7 @@ def test_fpgadataflow_ipstitch_do_stitch(mem_mode): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch_%s.onnx" % mem_mode) -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_rtlsim(mem_mode): @@ -281,7 +281,7 @@ def test_fpgadataflow_ipstitch_rtlsim(mem_mode): assert (rtlsim_res == x).all() -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -336,7 +336,7 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): pytest.skip("VITIS_PATH not set") platform = alveo_default_platform[board] fpga_part = alveo_part_map[board] - model = create_two_fc_model("external" if extw else "decoupled") + model = create_two_fc_model("external" if extw else "internal_decoupled") if model.graph.node[0].op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(model.graph.node[0]) assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 216b0f2937..c4112acfa4 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -217,8 +217,8 @@ def test_fpgadataflow_mvau_hwop(idt, wdt, act, nf, sf, mw, mh): assert (y_produced == y_expected).all(), "cppsim hw-op failed" -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) # weight datatype @@ -310,8 +310,8 @@ def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced == y_expected).all(), "cppsim hls-op failed" -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) # weight datatype @@ -411,8 +411,8 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert exp_cycles != 0 -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["decoupled"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_decoupled"]) # activation: None or DataType @pytest.mark.parametrize("act", [DataType["INT4"]]) # weight datatype @@ -513,8 +513,8 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( assert exp_cycles != 0 -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"]) # activation: None or DataType @pytest.mark.parametrize("act", [DataType["INT4"]]) # weight datatype diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 43eca7b7c3..6cf7b4fd40 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -57,7 +57,7 @@ target_clk_ns = 5 -def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs): +def make_single_thresholding_modelwrapper(T, idt, odt, actval, n_inp_vecs): NumChannels = T.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) @@ -72,13 +72,11 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_i domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", NumChannels=NumChannels, - PE=pe, numSteps=T.shape[1], inputDataType=idt.name, weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth outputDataType=odt.name, ActVal=actval, - mem_mode=mem_mode, numInputVectors=n_inp_vecs, ) graph = helper.make_graph( @@ -110,7 +108,7 @@ def make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_i # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # memory mode -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -135,7 +133,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: actval = odt.min() - model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode, n_inp_vecs) + model = make_single_thresholding_modelwrapper(T, idt, odt, actval, n_inp_vecs) # calculate reference output # multithreshold util fxn wants NCHW input, not NHWC @@ -163,6 +161,10 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): assert (y_produced == y_expected).all() model = model.transform(SpecializeLayers()) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + inst.set_nodeattr("mem_mode", mem_mode) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -201,7 +203,7 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): @pytest.mark.vivado def test_runtime_thresholds_single_layer(): n_inp_vecs = [1, 2, 2] - mem_mode = "decoupled" + mem_mode = "internal_decoupled" act = DataType["INT4"] idt = DataType["INT16"] nf = 8 diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index d4fef6952d..eb521f965a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -91,7 +91,7 @@ def _make_single_vvau_modelwrapper( odt, T=None, tdt=None, - mem_mode="const", + mem_mode="internal_embedded", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -181,7 +181,7 @@ def prepare_inputs(input_tensor): # Number of input and output channels @pytest.mark.parametrize("channels", [3, 6]) # memory mode -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 32534d4aa5..3e7822a077 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -73,7 +73,7 @@ def test_runtime_weights_single_layer(): model = model.transform(SpecializeLayers()) fcl = model.get_nodes_by_op_type("MVAU_hls")[0] op_inst = getCustomOp(fcl) - op_inst.set_nodeattr("mem_mode", "decoupled") + op_inst.set_nodeattr("mem_mode", "internal_decoupled") op_inst.set_nodeattr("runtime_writeable_weights", 1) old_weights = model.get_initializer(fcl.input[1]) op_inst.make_weight_file(old_weights, "decoupled_runtime", "old_weights.dat") diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py index d4901c92ce..d192755d06 100644 --- a/tests/fpgadataflow/test_split_large_fifos.py +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -86,7 +86,6 @@ def test_split_large_fifos(depth, force_python_rtlsim): build_cfg.DataflowOutputType.STITCHED_IP, build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, ], - default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, ) build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: From 4b3737a670932bf31383f54eebe22f188e9ebbb2 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 6 Mar 2024 14:30:44 +0000 Subject: [PATCH 188/291] [NBs] Cleanup advanced builder nb and add placeholder for specialize layer explanation --- .../4_advanced_builder_settings.ipynb | 191 ++++++------------ .../3-build-accelerator-with-finn.ipynb | 2 +- 2 files changed, 60 insertions(+), 133 deletions(-) diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index e748d85a1c..d9db2c2bc1 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -9,7 +9,7 @@ "\n", "\"drawing\"\n", "\n", - "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n", + "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS or RTL description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n", "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput." ] }, @@ -32,9 +32,10 @@ "1. [Introduction to the CNV-w2a2 network](#intro_cnv)\n", "2. [Recap default builder flow](#recap_builder)\n", "3. [Build steps](#build_step)\n", - " 1. [How to make a custom build step](#custom_step)\n", - "4. [Folding configuration json](#folding_config)\n", - "5. [Additional builder arguments](#builder_arg)\n", + " 1. [How to create a custom build step](#custom_step)\n", + "4. [Specialize layers configuration json](#specialize_layers)\n", + "5. [Folding configuration json](#folding_config)\n", + "6. [Additional builder arguments](#builder_arg)\n", " 1. [Verification steps](#verify)\n", " 2. [Other builder arguments](#other_args)\n", " 3. [Examples for additional builder arguments & bitfile generation](#example_args)" @@ -198,7 +199,7 @@ "id": "d746eff3", "metadata": {}, "source": [ - "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HLS layers (`step_convert_to_hls`). Then there is a partition created from all layers that were converted to HLS layers (`step_create_dataflow_partition`), then optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." + "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." ] }, { @@ -217,7 +218,7 @@ "id": "bccebd0d", "metadata": {}, "source": [ - "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HLS layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HLS layers have the module `finn.custom_op.fpgadataflow`." + "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." ] }, { @@ -227,7 +228,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hls.onnx\")" + "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hw.onnx\", localhost_url=\"xirxlabs60\")" ] }, { @@ -235,7 +236,7 @@ "id": "2719cc09", "metadata": {}, "source": [ - "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HLS layers. FINN currently only converts integer only operations into HLS layers, this means only when the input, output & weights are quantized to integer the node will be converted." + "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HW layers. FINN currently only converts integer only operations into HW layers, this means only when the input, output & weights are quantized to integer the node will be converted." ] }, { @@ -253,7 +254,7 @@ "id": "6e6d942e", "metadata": {}, "source": [ - "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it can not be derived from the preceeding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HLS layer, the input is assumed to be floating point." + "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it can not be derived from the preceeding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HW layer, the input is assumed to be floating point." ] }, { @@ -274,7 +275,7 @@ "Even though in the example of the CNVw2a2, the inputs are 32x32 RGB images, so the input values are 8 bit (UINT8) \"quantized\", the input to the exported model is floating point. For training in Brevitas, these values were normalized between 0 and 1.0 and so the exported model expects floating point values as input. \n", "This means we are in scenario 2. In the next section we will develop a custom step for the FINN builder flow to add preprocessing to our network.\n", "\n", - "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HLS layers." + "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HW layers." ] }, { @@ -368,7 +369,7 @@ "id": "e9c2c97f", "metadata": {}, "source": [ - "### How to make a custom build step " + "### How to create a custom build step " ] }, { @@ -439,8 +440,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -548,8 +550,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -614,7 +617,7 @@ "id": "5cc97505", "metadata": {}, "source": [ - "Let's have a look at the model after the conversion to hls, to verify that now all layers are correctly converted." + "Let's have a look at the model after the conversion to hw, to verify that now all layers are correctly converted." ] }, { @@ -624,7 +627,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hls.onnx\")" + "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hw.onnx\")" ] }, { @@ -635,6 +638,14 @@ "The model contains now a `Thresholding` layer in the beginning and a `LabelSelect_Batch` layer at the end. Please note, that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator." ] }, + { + "cell_type": "markdown", + "id": "a6edf5c4-9213-45cd-834f-615c12685d9e", + "metadata": {}, + "source": [ + "## Specialize layers configuration json " + ] + }, { "cell_type": "markdown", "id": "5ffbadd1", @@ -648,7 +659,7 @@ "id": "c164040f", "metadata": {}, "source": [ - "The FINN compiler allows the user to implement a network in streaming dataflow architecture, this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting the parallelism and resource type of each layer. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](3_folding.ipynb).\n", + "The FINN compiler allows the user to implement a network in streaming dataflow architecture, this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting the parallelism and resource type of each layer. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](./3_folding.ipynb).\n", "\n", "In this section, we will look into the interface over which we can influence the customization of each layer using the FINN builder tool: A json file containing the folding configuration." ] @@ -683,7 +694,7 @@ "source": [ "As you can see from the printed cell above, the keys in the .json file are the node names of the layers in our network. For each of the layers, some node attributes are listed:\n", "* `PE` and `SIMD` are the folding parameters that determine the parallelism of each layer, depending on the layer they can be set to different values, for details refer to [this table](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer).\n", - "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to external which allows for the implementation for external weights.\n", + "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS/RTL code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to external which allows for the implementation for external weights.\n", "* `ram_style`: when selecting `decoupled` mode, the FINN compiler allows us to choose which memory resource will be used for the layer. The argument `ram_style` is set to the selected memory type:\n", " * `auto`: Vivado will make the decision if the implementation is using LUTRAM or BRAM\n", " * `distributed`: LUTRAM will be used\n", @@ -795,8 +806,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", " \"step_generate_estimate_reports\",\n", @@ -899,8 +911,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", " \"step_generate_estimate_reports\",\n", @@ -937,7 +950,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\")" + "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\", localhost_url=\"xirxlabs60\")" ] }, { @@ -958,7 +971,7 @@ "id": "97f87780", "metadata": {}, "source": [ - "The initial implementation already had a high utilization of BRAM, but the estimations went now up to 522 BRAMs while the LUT count went down to ~99k." + "The initial implementation already had a high utilization of BRAM, but the estimations went now up to ~500 BRAMs while the LUT count went down to ~99k." ] }, { @@ -1103,8 +1116,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -1239,7 +1253,7 @@ "source": [ "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n", "\n", - "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. target_fps, fpga_part and folding_config_file. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." ] }, { @@ -1267,7 +1281,7 @@ "id": "b9bc5715", "metadata": {}, "source": [ - "You can see that after the generation of the estimate reports, the code generation and the ip generation is invoked (`step_hls_codegen` and `step_hls_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`), we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design, we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n", + "You can see that after the generation of the estimate reports, the code generation and the ip generation is invoked (`step_hw_codegen` and `step_hw_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`), we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design, we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n", "The FINN builder also provides automatic system integration for Zynq and Alveo devices, this can be invoked by running `step_synthesize_bitfile`, `step_make_pynq_driver` and `step_deployment_package`." ] }, @@ -1287,7 +1301,7 @@ "outputs": [], "source": [ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", - "print(build_dataflow_steps.step_hls_codegen.__doc__)" + "print(build_dataflow_steps.step_hw_codegen.__doc__)" ] }, { @@ -1297,7 +1311,7 @@ "metadata": {}, "outputs": [], "source": [ - "showSrc(build_dataflow_steps.step_hls_codegen)" + "showSrc(build_dataflow_steps.step_hw_codegen)" ] }, { @@ -1313,7 +1327,7 @@ "id": "3b98eb65", "metadata": {}, "source": [ - "### Examples for additional builder arguments & bitfile generation " + "### Example for additional builder arguments & bitfile generation " ] }, { @@ -1334,7 +1348,7 @@ "* A matrix multiplication\n", "* A MultiThreshold operation\n", "\n", - "When converting these nodes into HLS layers, by default the MatMul and the MultiThreshold gets converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n", + "When converting these nodes into HW layers, by default the MatMul and the MultiThreshold gets converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n", "\n", "If you would like to enable this feature, you can set the build argument `standalone_thresholds` to `True`. In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." ] @@ -1365,8 +1379,9 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", @@ -1408,103 +1423,6 @@ "#showInNetron(build_dir+\"/output_standalone_thresholds/intermediate_models/step_generate_estimate_reports.onnx\")" ] }, - { - "cell_type": "markdown", - "id": "074d8253", - "metadata": {}, - "source": [ - "#### RTL Convolutional Input Generator" - ] - }, - { - "cell_type": "markdown", - "id": "b85e5ac7", - "metadata": {}, - "source": [ - "Recently, we have worked on the *Operator Hardening* in the FINN compiler. This means that we implement core building blocks in RTL instead of using HLS.\n", - "One of these components is already available in the FINN compiler, you can enable the RTL implementation of the ConvolutionInputGenerator (aka Sliding Window Generator) by setting the build argument `force_rtl_conv_inp_gen` to `True`.\n", - "In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." - ] - }, - { - "cell_type": "markdown", - "id": "2a90b63f", - "metadata": {}, - "source": [ - "
\n", - "Important notice: We are actively working on the integration of RTL components in the FINN flow, the enablement like shown below might change in the future.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab0c4974", - "metadata": {}, - "outputs": [], - "source": [ - "## Build flow with additional builder arguments enabled\n", - "## force_rtl_conv_inp_gen = True\n", - "\n", - "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", - "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", - "\n", - "output_dir = build_dir + \"/output_rtl_swg\"\n", - "\n", - "#Delete previous run results if exist\n", - "if os.path.exists(output_dir):\n", - " shutil.rmtree(output_dir)\n", - " print(\"Previous run results deleted!\")\n", - "\n", - "build_steps = [\n", - " custom_step_add_pre_proc,\n", - " custom_step_add_post_proc,\n", - " \"step_qonnx_to_finn\",\n", - " \"step_tidy_up\",\n", - " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", - " \"step_create_dataflow_partition\",\n", - " \"step_target_fps_parallelization\",\n", - " \"step_apply_folding_config\",\n", - " \"step_minimize_bit_width\",\n", - " \"step_generate_estimate_reports\",\n", - "]\n", - "\n", - "cfg_estimates = build.DataflowBuildConfig(\n", - " output_dir = output_dir,\n", - " mvau_wwidth_max = 80,\n", - " target_fps = 10000,\n", - " synth_clk_period_ns = 10.0,\n", - " fpga_part = \"xc7z020clg400-1\",\n", - " force_rtl_conv_inp_gen = True,\n", - " steps = build_steps,\n", - " generate_outputs=[\n", - " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", - " ],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19fe4d85", - "metadata": {}, - "outputs": [], - "source": [ - "#%%time\n", - "#build.build_dataflow_cfg(model_file, cfg_estimates);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c1f1ce9", - "metadata": {}, - "outputs": [], - "source": [ - "#showInNetron(build_dir+\"/output_rtl_swg/intermediate_models/step_generate_estimate_reports.onnx\")" - ] - }, { "cell_type": "markdown", "id": "601eb5f8", @@ -1569,14 +1487,15 @@ " \"step_qonnx_to_finn\",\n", " \"step_tidy_up\",\n", " \"step_streamline\",\n", - " \"step_convert_to_hls\",\n", + " \"step_convert_to_hw\",\n", " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", " \"step_target_fps_parallelization\",\n", " \"step_apply_folding_config\",\n", " \"step_minimize_bit_width\",\n", " \"step_generate_estimate_reports\",\n", - " \"step_hls_codegen\",\n", - " \"step_hls_ipgen\",\n", + " \"step_hw_codegen\",\n", + " \"step_hw_ipgen\",\n", " \"step_set_fifo_depths\",\n", " \"step_create_stitched_ip\",\n", " \"step_measure_rtlsim_performance\",\n", @@ -1613,9 +1532,17 @@ "metadata": {}, "outputs": [], "source": [ - "#%%time\n", - "#build.build_dataflow_cfg(model_file, cfg_build);" + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_build);" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3eccb045-13b8-410b-bfcb-9e9c7146a1b4", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -1634,7 +1561,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 5e8bff3e04..73cd25cf20 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -265,7 +265,7 @@ "\n", "**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**\n", "\n", - "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`\n", + "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MVAU_hls_XXXXXX`\n", " \n", "* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`\n", " " From c79f364243bd5d2346b413589c1e7a078b6f99df Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 6 Mar 2024 15:51:04 +0000 Subject: [PATCH 189/291] [Threshold RTL] Remove redundent functions Signed-off-by: aziz bahri --- .../fpgadataflow/rtl/thresholding_rtl.py | 1 - src/finn/util/basic.py | 20 ------------------- 2 files changed, 21 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index ee101b1cc8..a7161a59bb 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -37,7 +37,6 @@ from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.util.basic import ( - find_next_power_of_2, get_memutil_alternatives, get_rtlsim_trace_depth, make_build_dir, diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 10edb7dc54..a80abfc876 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -235,26 +235,6 @@ def is_exe(fpath): return None - -def find_next_power_of_2(n): - """For any integer 'n', find the next greatest power of 2""" - # Negative values will loop infinitely below - return 0 - if n <= 0: - return 0 - # If '1' is requested, output will be '0' in the loop below, avoid this now. - elif n == 1: - return 2 # i.e. 2**1 - - # decrement 'n' (to handle cases when `n` itself is a power of 2) - n = n - 1 - - # loop until only one bit is left - while n & n - 1: - # unset rightmost bit - n = n & n - 1 - return n << 1 - - mem_primitives_versal = { "URAM_72x4096": (72, 4096), "URAM_36x8192": (36, 8192), From 999ed82c4857fbfd0f963437fe22b0f1f7823b87 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 6 Mar 2024 23:23:38 +0000 Subject: [PATCH 190/291] [mvau]: renamed method to more generic name --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index dc713c8b42..138cd9f3ad 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -476,7 +476,7 @@ def minimize_accumulator_width(self, model): # if the thresholds can be used to determine range, then adjust the range # according to the known values of the thresholds if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # set threshold datatype (and accumulator datatype implicitly) min_threshold = thresholds.min() max_threshold = thresholds.max() @@ -485,7 +485,7 @@ def minimize_accumulator_width(self, model): warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) thresholds = np.clip(thresholds, acc_min, acc_max) model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() acc_min = min(min_threshold, acc_min) @@ -762,7 +762,7 @@ def generate_params(self, model, path): if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] From 209b81ce767763d2c34a8c109e7435e2d75fc791 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 6 Mar 2024 23:24:23 +0000 Subject: [PATCH 191/291] [rtl mvau]: add CPPsim functionality (fall back to MVAU exec) --- .../rtl/matrixvectoractivation_rtl.py | 117 ++++++++---------- 1 file changed, 55 insertions(+), 62 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index ae04b003bd..425d1b4e15 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -68,9 +68,63 @@ def execute_node(self, context, graph): node = self.onnx_node if mode == "cppsim": - raise Exception("cppsim not possible for RTL MVAU, please set exec_mode to rtlsim") + MVAU.execute_node(self, context, graph) elif mode == "rtlsim": code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") + in_ind += 1 + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + reset_rtlsim(sim) + toggle_clk(sim) + if mem_mode in ["external", "decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} @@ -79,67 +133,6 @@ def execute_node(self, context, graph): ) ) - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation_rtl") - in_ind += 1 - - if mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - reset_rtlsim(sim) - toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to "rtlsim" """.format( - mode - ) - ) - def lut_estimation(self): return 0 From 0f216a7455978ea83eb961a0c2f06726a05b7959 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 6 Mar 2024 23:28:27 +0000 Subject: [PATCH 192/291] [specialize layers]: minor bugfix and removed VVU-related support --- .../fpgadataflow/specialize_layers.py | 42 +++++-------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 4027b0c949..191d84a8d3 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -26,7 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import numpy as np import warnings from onnx import helper from qonnx.core.datatype import DataType @@ -35,7 +34,6 @@ from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants -from finn.util.fpgadataflow import is_versal def _determine_impl_style(node, fpgapart=""): @@ -55,10 +53,10 @@ def _determine_impl_style(node, fpgapart=""): if optype == "StreamingDataWidthConverter": return _dwc_determine_impl_style(node) if rtl_variant: - impl_style = "rtl" + return "rtl" # but if no rtl variant, set impl_style to hls elif hls_variant: - impl_style = "hls" + return "hls" # if there is neither an rtl nor hls variant # throw error else: @@ -126,15 +124,6 @@ def _determine_impl_style(node, fpgapart=""): node.name, ) warnings.warn(warn_str) - elif optype == "VectorVectorActivation": - if _vvu_rtl_possible(node, fpgapart): - return "rtl" - else: - warn_str = """There is no RTL variant for %s. The node will automatically be - set to HLS variant.""" % ( - node.name, - ) - warnings.warn(warn_str) if rtl_variant: return "rtl" @@ -221,27 +210,16 @@ def _mvu_rtl_possible(n): folding_supported = ( getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0 ) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) + targets_dsp = getCustomOp(n).get_nodeattr("resType") in ["dsp", "auto"] + external_memmode = getCustomOp(n).get_nodeattr("mem_mode") in ["decoupled", "external"] - return act_width_in_range and weight_width_in_range and folding_supported - - -def _vvu_rtl_possible(n, fpgapart): - # Checks whether RTL-based VVU is supported - act_width_in_range = ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 - ) or ( - DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 - and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 + return ( + act_width_in_range + and weight_width_in_range + and folding_supported + and targets_dsp + and external_memmode ) - weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - folding_supported = ( - getCustomOp(n).get_nodeattr("Channels") % getCustomOp(n).get_nodeattr("PE") == 0 - ) and ( - np.prod(getCustomOp(n).get_nodeattr("Kernel")) % getCustomOp(n).get_nodeattr("SIMD") == 0 - ) - is_versal_family = is_versal(fpgapart) - - return act_width_in_range and weight_width_in_range and folding_supported and is_versal_family class SpecializeLayers(Transformation): From dd0369c3752b04262a4b5a880558cb1f2b79a10f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 6 Mar 2024 23:30:11 +0000 Subject: [PATCH 193/291] [test]: added RTL-MVAU CPPsim test --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 22 +++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 32edf36365..03f1293b74 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -654,10 +654,14 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) model = model.transform(GiveUniqueNodeNames()) + # Apply convert-to-rtl step + model = model.transform(SpecializeLayers(part)) + model = model.transform(GiveUniqueNodeNames()) + # Apply folding (i.e. specify to use DSPs) folding_config = { "Defaults": {}, - "MVAU_0": { + "MVAU_rtl_0": { "PE": pe, "SIMD": simd, "mem_mode": "decoupled", @@ -671,9 +675,16 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): # make sure the changed datatypes are propagated through the network model = model.transform(InferDataTypes()) - # Apply convert-to-rtl step - model = model.transform(SpecializeLayers(part)) - model = model.transform(GiveUniqueNodeNames()) + # Run CPPsim + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + output_mvau_hls = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + output_matmul == output_mvau_hls + ).all(), "Output of ONNX model not matching output of node-by-node CPPsim!" + + # Run node-by-node RTLsim model = model.transform(SetExecMode("rtlsim")) model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) @@ -682,8 +693,9 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): assert ( output_matmul == output_mvau_rtl - ).all(), "Output of ONNX model not matching output of node-by-node sim!" + ).all(), "Output of ONNX model not matching output of node-by-node RTLsim!" + # Run stitched-ip RTLsim model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) model = model.transform(PrepareIP(part, clk_ns)) model = model.transform(HLSSynthIP()) From 41b76150ff9402e751d2cf3d17f8fc8956ec488e Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 7 Mar 2024 09:04:25 +0000 Subject: [PATCH 194/291] [tests] remove util test --- tests/util/test_basic.py | 60 ---------------------------------------- 1 file changed, 60 deletions(-) delete mode 100755 tests/util/test_basic.py diff --git a/tests/util/test_basic.py b/tests/util/test_basic.py deleted file mode 100755 index 97a8c50261..0000000000 --- a/tests/util/test_basic.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (C) 2023, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import finn.util.basic as basic - - -@pytest.mark.util -def test_next_power_of_2(): - test_vector = [ - {"input": -2, "expected_result": 0}, - {"input": -1, "expected_result": 0}, - {"input": 0, "expected_result": 0}, - {"input": 1, "expected_result": 2}, - {"input": 2, "expected_result": 2}, - {"input": 3, "expected_result": 4}, - {"input": 4, "expected_result": 4}, - {"input": 7, "expected_result": 8}, - {"input": 8, "expected_result": 8}, - {"input": 11, "expected_result": 16}, - {"input": 15, "expected_result": 16}, - {"input": 16, "expected_result": 16}, - {"input": 18, "expected_result": 32}, - {"input": 27, "expected_result": 32}, - {"input": 31, "expected_result": 32}, - {"input": 32, "expected_result": 32}, - {"input": 42, "expected_result": 64}, - {"input": 65, "expected_result": 128}, - ] - - for test_dict in test_vector: - output = basic.find_next_power_of_2(test_dict["input"]) - assert output >= test_dict["input"] - assert output == test_dict["expected_result"] From 216cb0d549467c9856083fa97ff694adb6871890 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 7 Mar 2024 14:49:09 +0000 Subject: [PATCH 195/291] [tests] Dont skip BIPOLAR test for thresholding Signed-off-by: aziz bahri --- .../fpgadataflow/convert_to_hw_layers.py | 10 +++++++--- tests/fpgadataflow/test_convert_to_hw_thresholding.py | 8 -------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index fdb892e911..c1d7dbc298 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -236,9 +236,13 @@ def apply(self, model): node.name + ": MultiThreshold out_bias must be integer for HLS conversion." ) actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requires actval < 0" - ) + + # a signed activation should always have a negative bias, + # but BIPOLAR uses the -1 as 0 encoding so the assert does not apply + if odt != "BIPOLAR": + assert (not odt.signed()) or (actval < 0), ( + node.name + ": Signed output requires actval < 0" + ) new_node = helper.make_node( "Thresholding", diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index 9d44702152..ef08d87846 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -144,14 +144,6 @@ def test_convert_multithreshold_to_hardware( pe = generate_pe_value(fold, num_input_channels) num_steps = activation.get_num_possible_values() - 1 - # See convert_to_hw_layers::InferThresholdingLayer: - # assert (not odt.signed()) or (actval < 0) - # This implies that it expects a negative activation, BIPOLAR does not provide that - if activation == DataType["BIPOLAR"]: - pytest.skip( - "Only negative activations are supported for " "RTL Thresholding Binary Search node" - ) - # Other non-input parameters num_input_vecs = [1, 2, 2] output_data_type = activation From 00147454b4e2812d29c09d6eee89906bd8afd9cf Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 7 Mar 2024 15:01:48 +0000 Subject: [PATCH 196/291] [Thresholding] bipolar type do not require negative activation Signed-off-by: aziz bahri --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index c1d7dbc298..27f257b917 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -239,7 +239,7 @@ def apply(self, model): # a signed activation should always have a negative bias, # but BIPOLAR uses the -1 as 0 encoding so the assert does not apply - if odt != "BIPOLAR": + if odt != DataType["BIPOLAR"]: assert (not odt.signed()) or (actval < 0), ( node.name + ": Signed output requires actval < 0" ) From 95b51bad6cc1ffe423e0f1ae1cb5509a1a35b1b2 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Thu, 7 Mar 2024 15:14:34 +0000 Subject: [PATCH 197/291] [refactor] linting --- .../fpgadataflow/rtl/thresholding_rtl.py | 37 ++++++++++--------- src/finn/util/basic.py | 1 + .../test_convert_to_hw_thresholding.py | 12 ++++-- .../test_fpgadataflow_thresholding.py | 4 +- 4 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index a7161a59bb..f30a305dfe 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -79,20 +79,21 @@ def get_nodeattr_types(self): return my_attrs def get_pe_mem_geometries(self): - ''' return a list of (bitwidth, depth) for PE memory configurations to be used in resource estimation - - for each bitwidth, the depth is calculated as the - number of thresholds that can be stored in a single - memory block - the bitwidth is the bitwidth of the threshold values - the depth is the number of thresholds that can be stored - in a single memory block - the number of memory blocks is calculated as the number - of thresholds divided by the depth - the number of memory blocks is then multiplied by the - number of PEs to get the total number of memory blocks - required for the entire layer - ''' + """return a list of (bitwidth, depth) for PE memory configurations to be used + in resource estimation + + for each bitwidth, the depth is calculated as the + number of thresholds that can be stored in a single + memory block + the bitwidth is the bitwidth of the threshold values + the depth is the number of thresholds that can be stored + in a single memory block + the number of memory blocks is calculated as the number + of thresholds divided by the depth + the number of memory blocks is then multiplied by the + number of PEs to get the total number of memory blocks + required for the entire layer + """ pe = self.get_nodeattr("PE") wdt = self.get_weight_datatype() wdt_bits = wdt.bitwidth() @@ -108,7 +109,7 @@ def get_pe_mem_geometries(self): return ret def get_memory_estimate(self): - ''' return the memory estimate for this node ''' + """return the memory estimate for this node""" res_dict = {} depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") @@ -130,17 +131,17 @@ def get_memory_estimate(self): return res_dict def bram_estimation(self): - ''' return the number of BRAMs required for this node ''' + """return the number of BRAMs required for this node""" res_dict = self.get_memory_estimate() return res_dict.get("BRAM", 0) def uram_estimation(self): - ''' return the number of URAMs required for this node ''' + """return the number of URAMs required for this node""" res_dict = self.get_memory_estimate() return res_dict.get("URAM", 0) def lut_estimation(self): - ''' return the number of LUTs required for this node ''' + """return the number of LUTs required for this node""" res_dict = self.get_memory_estimate() return res_dict.get("LUTRAM", 0) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index a80abfc876..1995d9f06a 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -235,6 +235,7 @@ def is_exe(fpath): return None + mem_primitives_versal = { "URAM_72x4096": (72, 4096), "URAM_36x8192": (36, 8192), diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py index ef08d87846..63cb5986e1 100755 --- a/tests/fpgadataflow/test_convert_to_hw_thresholding.py +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -32,15 +32,16 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor -from qonnx.custom_op.general.multithreshold import multithreshold + +import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -import finn.core.onnx_exec as oxe test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -50,19 +51,23 @@ def sort_thresholds_increasing(thresholds): return np.sort(thresholds, axis=1) + def prepare_inputs(input_tensor): return {"inp": input_tensor} + # n = batch, c = channel, h = height, w = width of feature map # Standard = NCHW; FINN = NHWC # Convert from NHWC(FINN) to NCHW(Standard) def layout_FINN2NCHW(data): return np.transpose(data, (0, 3, 1, 2)) + # Convert from NCHW(Standard) to NHWC(FINN) def layout_NCHW2FINN(data): return np.transpose(data, (0, 2, 3, 1)) + def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): return np.random.randint( input_data_type.min(), @@ -190,7 +195,8 @@ def test_convert_multithreshold_to_hardware( assert (y_produced == y_expected).all() - # Transform to the specified implementation style, either the RTL or HLS according to test parameters + # Transform to the specified implementation style, either the + # RTL or HLS according to test parameters node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("preferred_impl_style", impl_style) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 2a316e6c1b..fc3996ddab 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -81,9 +81,7 @@ def layout_NCHW2FINN(data): return np.transpose(data, (0, 2, 3, 1)) -def make_single_thresholding_modelwrapper( - impl_style, T, idt, odt, actval, n_inp_vecs -): +def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs): NumChannels = T.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) From 0d240f63b378f64a205a5d0b1bdf8ebe65d4e07d Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Mar 2024 15:26:04 +0000 Subject: [PATCH 198/291] [rtl mvau]: added methods related to RTL file retrieval and corrected DSP estimations --- .../rtl/matrixvectoractivation_rtl.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 425d1b4e15..4f17aab5fd 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -139,17 +139,11 @@ def lut_estimation(self): def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) + dsp_res = {} + dsp_res["DSP48"] = np.ceil(P / 4) * Q + dsp_res["DSP58"] = P * np.ceil(Q / 3) + return dsp_res def code_generation_ipgen(self, model, fpgapart, clk): self.generate_hdl(model, fpgapart, clk) @@ -258,7 +252,6 @@ def prepare_codegen_default(self, fpgapart, clk): code_gen_dict = {} code_gen_dict["$IS_MVU$"] = [str(1)] code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] - # code_gen_dict["$PUMPED_COMPUTE$"] = [str(0)] code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] @@ -298,3 +291,19 @@ def prepare_rtlsim(self): self.set_nodeattr("rtlsim_so", sim.lib._name) return sim + + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + return verilog_paths + + def get_verilog_top_filename(self): + "Return the Verilog top module filename for this node." + + verilog_file = "{}/{}_wrapper.v".format( + self.get_nodeattr("code_gen_dir_ipgen"), self.get_nodeattr("gen_top_module") + ) + return verilog_file From 943dcf3b03125d057f73403db1aeae7db1a5927f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Mar 2024 15:33:30 +0000 Subject: [PATCH 199/291] updated copyright header --- finn-rtllib/mvu/mvu_4sx4u.sv | 33 ++++++ finn-rtllib/mvu/mvu_8sx8u_dsp48.sv | 33 ++++++ finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 4 +- finn-rtllib/mvu/mvu_vvu_axi.sv | 9 +- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 165 ++++++++++++++++++++++++++ finn-rtllib/mvu/tb/mvu_axi_tb.sv | 4 +- finn-rtllib/mvu/tb/mvu_dsp58_tb.sv | 142 ++++++++++++++++++++++ finn-rtllib/mvu/tb/vvu_axi_tb.sv | 10 +- 9 files changed, 387 insertions(+), 15 deletions(-) create mode 100644 finn-rtllib/mvu/tb/mvu_8sx9_tb.sv create mode 100644 finn-rtllib/mvu/tb/mvu_dsp58_tb.sv diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv index aafe0e3429..0ac2628ee5 100644 --- a/finn-rtllib/mvu/mvu_4sx4u.sv +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -1,3 +1,36 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48. + *****************************************************************************/ + module mvu_4sx4u #( int unsigned PE, int unsigned SIMD, diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 1423153c97..fbf48784f0 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -1,3 +1,36 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48. + *****************************************************************************/ + module mvu_8sx8u_dsp48 #( int unsigned PE, int unsigned SIMD, diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 53cf71fd5f..2cc6cf1bcf 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,7 +28,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP58. + * @brief Matrix/Vector Vector Unit (MVU/VVU) core compute kernel utilizing DSP58. *****************************************************************************/ module mvu_vvu_8sx9_dsp58 #( diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 91e3b77216..d7b16319c8 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,10 +31,9 @@ * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. * @details * The following compute cores are supported: - * - 4-bit MVU on DSP48 & DSP58 achieving 4 MACs/DSP, - * (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, - * [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, - * 'unconstrained' LUT-based MVU and VVU. + * - 4-bit MVU on DSP48 achieving 4 MACs/DSP, + * - (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * - [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, * Folding hints: * - PE scaling should divide MH. * - SIMD scaling should divide MW. diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index ee067fa8b5..936f2ce0fc 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..c8bfe5370a --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 2f35a112ab..51bf623831 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,7 +28,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Testbench for MVU AXI-lite interface wrapper. + * @brief Testbench for MVU AXI wrapper module. *****************************************************************************/ module mvu_axi_tb(); diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv new file mode 100644 index 0000000000..108980c497 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv @@ -0,0 +1,142 @@ +module mvu_dsp58_tb; + + localparam int unsigned N = 1000; + + localparam int unsigned MW = 12; + localparam int unsigned MH = 4; + localparam int unsigned PE = 2; + localparam int unsigned SIMD = 6; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 8; + localparam int unsigned ACCU_WIDTH = 24; + + //- Global Control ------------------ + logic clk = 1; + logic clk2x = 1; + always #5ns clk = !clk; + always #2.5ns clk2x = !clk2x; + + logic rst = 1; + initial begin + repeat(8) @(posedge clk); + rst <= 0; + end + + //- DUTs ---------------------------- + + // Weight Stream + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic s_axis_weights_tvalid[2]; + uwire s_axis_weights_tready[2]; + + // Input Stream + logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid[2]; + uwire s_axis_input_tready[2]; + + // Output Stream + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2]; + uwire m_axis_output_tvalid[2]; + logic m_axis_output_tready[2]; + + for(genvar i = 0; i < 2; i++) begin : genDUTs + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"), + .MW(MW), .MH(MH), + .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .PUMPED_COMPUTE(i) + ) dut ( + .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]), + .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]), + .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i]) + ); + end : genDUTs + + + //- Stimuli ------------------------- + + // Weight Feed + initial begin + s_axis_weights_tvalid = '{ default: 0 }; + s_axis_weights_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)*(MW/SIMD)) begin + automatic type(s_axis_weights_tdata) weights; + std::randomize(weights); + s_axis_weights_tdata <= weights; + s_axis_weights_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_weights_tready[0]); + s_axis_weights_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_weights_tready[1]); + s_axis_weights_tvalid[1] <= 0; + end + join + end + end + + // Input Feed + initial begin + s_axis_input_tvalid = '{ default: 0 }; + s_axis_input_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MW/SIMD)) begin + automatic type(s_axis_input_tdata) in; + std::randomize(in); + s_axis_input_tdata <= in; + s_axis_input_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_input_tready[0]); + s_axis_input_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_input_tready[1]); + s_axis_input_tvalid[1] <= 0; + end + join + end + end + + // Output Capture and Comparison + initial begin + m_axis_output_tready = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)) begin + automatic type(m_axis_output_tdata) res; + m_axis_output_tready <= '{ default: 1 }; + fork + begin + @(posedge clk iff m_axis_output_tvalid[0]); + m_axis_output_tready[0] <= 0; + res[0] = m_axis_output_tdata[0]; + end + begin + @(posedge clk iff m_axis_output_tvalid[1]); + m_axis_output_tready[1] <= 0; + res[1] = m_axis_output_tdata[1]; + end + join + assert(res[0] == res[1]) else begin + $error("Output mismatch: %0x <=> %0x", res[0], res[1]); + $stop; + end + while($urandom()%7 < MW/SIMD) @(posedge clk); // Occassional backpressure + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_dsp58_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv index fbb45845e1..853dcc6e17 100644 --- a/finn-rtllib/mvu/tb/vvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (C) 2022, Advanced Micro Devices, Inc. + * Copyright (C) 2024, Advanced Micro Devices, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,7 +28,7 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * @brief Testbench for MVU AXI-lite interface wrapper. + * @brief Testbench for VVU AXI wrapper module. *****************************************************************************/ module vvu_axi_tb(); @@ -39,9 +39,9 @@ module vvu_axi_tb(); localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; localparam int unsigned MW = 25; // Kernel*Kernel localparam int unsigned MH = 4; // Channels - localparam int unsigned SIMD = 25; // MW%SIMD == 0 - localparam int unsigned PE = 2; // MH%PE == 0 - localparam int unsigned SEGMENTLEN = 3.0; + localparam int unsigned SIMD = 1; // MW%SIMD == 0 + localparam int unsigned PE = 1; // MH%PE == 0 + localparam int unsigned SEGMENTLEN = 1.0; localparam bit FORCE_BEHAVIORAL = 1; localparam bit M_REG_LUT = 1; // Bit-width config From dde16a9e8029497d3a161b94239e4a0bcfafea0f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Mar 2024 15:36:41 +0000 Subject: [PATCH 200/291] [transform]: renamed variable --- src/finn/transformation/fpgadataflow/specialize_layers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 191d84a8d3..29921a97f5 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -200,7 +200,7 @@ def _swg_hls_possible(node): def _mvu_rtl_possible(n): # Checks whether RTL-based MVU is supported - act_width_in_range = ( + inp_width_in_range = ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 ) or ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 @@ -214,7 +214,7 @@ def _mvu_rtl_possible(n): external_memmode = getCustomOp(n).get_nodeattr("mem_mode") in ["decoupled", "external"] return ( - act_width_in_range + inp_width_in_range and weight_width_in_range and folding_supported and targets_dsp @@ -255,4 +255,4 @@ def apply(self, model): # remove old nodes graph.node.remove(node) graph_modified = True - return (model, graph_modified) + return (model, graph_modified) \ No newline at end of file From 8986c23b890323cd3d044a53418211bd9ce27cb7 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Mar 2024 15:40:07 +0000 Subject: [PATCH 201/291] [rtlbackend]: added additional parameters to generate_hdl --- src/finn/custom_op/fpgadataflow/rtlbackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 96deb49161..be2a9e75c1 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -54,7 +54,7 @@ def code_generation_ipi(self): pass def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl() + self.generate_hdl(model, fpgapart, clk) # TODO: Implement alternative def hls_sname(self): From ee5312ed3914b094d3c2ed0a4a0508e6e0f37051 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Mar 2024 15:42:11 +0000 Subject: [PATCH 202/291] [rtl op]: extended generate_hdl argument list --- .../fpgadataflow/rtl/convolutioninputgenerator_rtl.py | 2 +- src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py | 2 +- .../custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py | 3 --- .../fpgadataflow/rtl/streamingdatawidthconverter_rtl.py | 2 +- src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py | 2 +- 5 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 08564ca6da..35026a169c 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -839,7 +839,7 @@ def select_impl_style(self): return impl_style - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): """Generates HDL code and wrapper for the IP, depending on required implementation style.""" impl_style = self.select_impl_style() diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index 19765d64c4..cc49446ea3 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -171,7 +171,7 @@ def get_dynamic_config(self, ifm_dims=None, pads=None): } return config - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" template_path = rtlsrc + "/fmpadding_template.v" dims = self.get_nodeattr("ImgDim") diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 4f17aab5fd..c50ca52077 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -145,9 +145,6 @@ def dsp_estimation(self): dsp_res["DSP58"] = P * np.ceil(Q / 3) return dsp_res - def code_generation_ipgen(self, model, fpgapart, clk): - self.generate_hdl(model, fpgapart, clk) - def instantiate_ip(self, cmd): # instantiate the RTL IP code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index ef918b5db8..e79782eb6d 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -137,7 +137,7 @@ def get_template_values(self): } return code_gen_dict - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/hdl" template_path = rtlsrc + "/dwc_template.v" code_gen_dict = self.get_template_values() diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py index a9d9e689eb..dfae607622 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -82,7 +82,7 @@ def get_verilog_top_module_intf_names(self): ret["ap_none"] = ["maxcount"] return ret - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fifo/hdl" template_path = rtlsrc + "/fifo_template.v" From b69a0fdc9290aa5b770d2a42daaf8b017c023c90 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 7 Mar 2024 15:42:53 +0000 Subject: [PATCH 203/291] [rtlbackend]: extended argument list of abstractmethod accordingly --- src/finn/custom_op/fpgadataflow/rtlbackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index be2a9e75c1..8a81b7028b 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -42,7 +42,7 @@ def get_nodeattr_types(self): } @abstractmethod - def generate_hdl(self): + def generate_hdl(model, fpgapart, clk): pass @abstractmethod From 366db07511f92c636abec65bd071d4da558c1543 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 10:28:30 +0000 Subject: [PATCH 204/291] [mvau]: renamed method to more generic name --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 138cd9f3ad..e1e098e676 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -593,7 +593,7 @@ def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): + def get_hw_compatible_weight_tensor(self, orig_weight_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 and MW % SIMD == 0 @@ -644,7 +644,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): """ # convert weights into hlslib-compatible format - weight_tensor = self.get_hls_compatible_weight_tensor(weights) + weight_tensor = self.get_hw_compatible_weight_tensor(weights) export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, # so use it as such for weight generation From 4334dd95328d69b615b70ecb60efd350c5633b70 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 10:30:53 +0000 Subject: [PATCH 205/291] minor fix to abstractmethod parameters --- src/finn/custom_op/fpgadataflow/rtlbackend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 8a81b7028b..2e4d647b22 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -42,7 +42,7 @@ def get_nodeattr_types(self): } @abstractmethod - def generate_hdl(model, fpgapart, clk): + def generate_hdl(self, model, fpgapart, clk): pass @abstractmethod From 092979886335a939067779019e27cd0f545635ca Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 12:37:25 +0000 Subject: [PATCH 206/291] minor fix to comment --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index e1e098e676..b8dba2f9d1 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -643,7 +643,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ - # convert weights into hlslib-compatible format + # convert weights into hlslib/rtllib-compatible format weight_tensor = self.get_hw_compatible_weight_tensor(weights) export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, From eb6e0ae0cf04fd41c156270e6132c2c2cc27bec9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 12:38:29 +0000 Subject: [PATCH 207/291] [test]: cleaned up test and minor modifications for supporting RTL-op --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 43 ++++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 03f1293b74..6c2940f8f7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -312,7 +312,7 @@ def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): inst.set_nodeattr("mem_mode", mem_mode) # Note: only HLS-based MVAU layers execute CPPsim inst.set_nodeattr("preferred_impl_style", "hls") - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -402,7 +402,6 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) - inst.set_nodeattr("rtlsim_trace", "mvau_trace.vcd") inst.set_nodeattr("preferred_impl_style", "hls") # prepare input data @@ -424,13 +423,12 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) - model.save("mvau_rtl.onnx") y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" @@ -449,7 +447,7 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["decoupled"]) # activation: None or DataType -@pytest.mark.parametrize("act", [DataType["INT4"]]) +@pytest.mark.parametrize("act", [None, DataType["INT4"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) # input datatype @@ -462,11 +460,15 @@ def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style ): + if preferred_impl_style == "rtl" and act is not None: + pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh if sf == -1: @@ -507,6 +509,8 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -527,7 +531,9 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -537,9 +543,10 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MVAU_hls_0" in hls_synt_res_est + if preferred_impl_style == "hls": + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MVAU_hls")[0] + node = model.get_nodes_by_op_type("MVAU")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -551,7 +558,7 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( # mem_mode: const or decoupled @pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) # activation: None or DataType -@pytest.mark.parametrize("act", [DataType["INT4"]]) +@pytest.mark.parametrize("act", [None, DataType["INT4"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) # input datatype @@ -564,9 +571,15 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("mw", [32]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [32]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_mvau_fifocharacterize_rtlsim( + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style +): + if preferred_impl_style == "rtl" and (mem_mode == "const" or act is not None): + pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh if sf == -1: @@ -591,9 +604,13 @@ def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) total_fold = nf * sf exp_total_cycles = total_fold + 10 - model = model.transform(SpecializeLayers()) + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -608,7 +625,7 @@ def test_mvau_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert chrc_in.shape == (1, 2 * exp_total_cycles) assert chrc_out.shape == (1, 2 * exp_total_cycles) # first sf cycles should read input continuously - assert (chrc_in[0, :sf] == range(1, sf + 1)).all() + assert (chrc_in[0, :sf] == list(range(1, sf + 1))).all() # all outputs should be produced within the exp n of cycles assert chrc_out[0, exp_total_cycles] == nf From 5f7e9aea10d696b9dc478bee5af13a56a69c6aaa Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 12:43:17 +0000 Subject: [PATCH 208/291] [test]: minor change to get_nodes_by_op_type call --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 6c2940f8f7..9254c7ac5a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -546,7 +546,10 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( if preferred_impl_style == "hls": assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MVAU")[0] + if preferred_impl_style == "hls": + node = model.get_nodes_by_op_type("MVAU_hls")[0] + else: + node = model.get_nodes_by_op_type("MVAU_rtl")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From 4bb2e88c73fab25671f103edf791a1dddbed20f5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 12:43:27 +0000 Subject: [PATCH 209/291] updated PyVerilator commit hash --- fetch-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch-repos.sh b/fetch-repos.sh index 1275ccf31c..119d3f1172 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -30,7 +30,7 @@ QONNX_COMMIT="47e4357faf66b5b0d1bf77bf908bb47752421e5b" FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2" BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" -PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" +PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" From dbd715da145d0b73ff8c51b512f30416ae413290 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 13:32:12 +0000 Subject: [PATCH 210/291] [rtl mvau]: updated DSP resource estimates --- .../rtl/matrixvectoractivation_rtl.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index c50ca52077..dccdc67d00 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -34,7 +34,6 @@ from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -from finn.util.fpgadataflow import is_versal try: from pyverilator import PyVerilator @@ -57,7 +56,10 @@ def __init__(self, onnx_node, **kwargs): super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = {} + my_attrs = { + # Flag to indicate if Versal device is targeted + "is_versal": ("i", False, 0, {0, 1}), + } my_attrs.update(MVAU.get_nodeattr_types(self)) my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs @@ -140,10 +142,11 @@ def dsp_estimation(self): # multiplication P = self.get_nodeattr("PE") Q = self.get_nodeattr("SIMD") - dsp_res = {} - dsp_res["DSP48"] = np.ceil(P / 4) * Q - dsp_res["DSP58"] = P * np.ceil(Q / 3) - return dsp_res + if self.get_nodeattr("is_versal"): + mult_dsp = P * np.ceil(Q / 3) + else: + mult_dsp = np.ceil(P / 4) * Q + return int(mult_dsp) def instantiate_ip(self, cmd): # instantiate the RTL IP @@ -196,7 +199,7 @@ def _resolve_impl_style(self, fpgapart): act_width = self.get_input_datatype(0).bitwidth() weight_width = self.get_input_datatype(1).bitwidth() - is_versal_family = is_versal(fpgapart) + is_versal_family = self.get_nodeattr("is_versal") if is_versal_family: return "mvu_vvu_8sx9_dsp58" From 8859d81970465b0912b13cfe4b910ccb8aa89913 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 13:32:50 +0000 Subject: [PATCH 211/291] [transform]: added additional check for rtl-MVAU and added is_versal node attribute for rtl-MVAU --- .../transformation/fpgadataflow/specialize_layers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 29921a97f5..6349bdb713 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -34,6 +34,7 @@ from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants +from finn.util.fpgadataflow import is_versal def _determine_impl_style(node, fpgapart=""): @@ -114,9 +115,6 @@ def _determine_impl_style(node, fpgapart=""): return "rtl" elif optype == "MVAU": if _mvu_rtl_possible(node): - if getCustomOp(node).get_nodeattr("noActivation") == 0: - # Split thresholding - pass return "rtl" else: warn_str = """There is no RTL variant for %s. The node will automatically be @@ -212,6 +210,7 @@ def _mvu_rtl_possible(n): ) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) targets_dsp = getCustomOp(n).get_nodeattr("resType") in ["dsp", "auto"] external_memmode = getCustomOp(n).get_nodeattr("mem_mode") in ["decoupled", "external"] + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 return ( inp_width_in_range @@ -219,6 +218,7 @@ def _mvu_rtl_possible(n): and folding_supported and targets_dsp and external_memmode + and no_activation ) @@ -251,8 +251,11 @@ def apply(self, model): for attribute in node.attribute: if attribute.name != "preferred_impl_style": new_node.attribute.append(attribute) + is_versal_family = is_versal(self.fpgapart) + if new_node.op_type == "MVAU_rtl": + getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) graph.node.insert(node_ind, new_node) # remove old nodes graph.node.remove(node) graph_modified = True - return (model, graph_modified) \ No newline at end of file + return (model, graph_modified) From f61ced89b4a915a397932ac60ea392a8485ea0e1 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 8 Mar 2024 15:06:25 +0000 Subject: [PATCH 212/291] [Tests] Set mem_mode only if impl_style=hls for thresholding --- tests/fpgadataflow/test_fpgadataflow_thresholding.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index fc3996ddab..8dee95fa82 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -139,6 +139,14 @@ def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp @pytest.mark.vivado @pytest.mark.slow def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem_mode): + # the mem_mode parameter can only be used for the hls thresholding + # so the test will only be executed once for impl_style=rtl and once skipped + # when the mem_mode is varied. Otherwise, the same test configuration would always + # run twice. + if impl_style == "rtl" and mem_mode == "internal_decoupled": + pytest.skip( + "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded" + ) if nf == -1: nf = ich pe = ich // nf @@ -199,7 +207,8 @@ def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem node = model.graph.node[0] inst = getCustomOp(node) inst.set_nodeattr("PE", pe) - inst.set_nodeattr("mem_mode", mem_mode) + if impl_style == "hls": + inst.set_nodeattr("mem_mode", mem_mode) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) From 5fe519d966ed33947863d515c1262cb3aada4007 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 8 Mar 2024 15:27:59 +0000 Subject: [PATCH 213/291] [Thresholding] Rename mem mode to internal_decoupled --- src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index 96d5f2d8b9..b753bc7a03 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -133,7 +133,7 @@ def get_weightstream_width_padded(self): def get_ap_int_max_w(self): ap_int_max_w = HLSBackend.get_ap_int_max_w(self) - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": weightstream = self.get_weightstream_width() ap_int_max_w = max([weightstream, ap_int_max_w]) return ap_int_max_w From b67281b758b463457a079eb19664b4ef8f7afff5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 15:47:17 +0000 Subject: [PATCH 214/291] [transform]: add default empty string to fpgapart --- src/finn/transformation/fpgadataflow/specialize_layers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 6349bdb713..1563ef83ca 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -37,7 +37,7 @@ from finn.util.fpgadataflow import is_versal -def _determine_impl_style(node, fpgapart=""): +def _determine_impl_style(node): optype = node.op_type # check if there is an HLS or RTL variant or both @@ -225,7 +225,7 @@ def _mvu_rtl_possible(n): class SpecializeLayers(Transformation): """Specialize all layers to either HLS or RTL variants""" - def __init__(self, fpgapart): + def __init__(self, fpgapart=""): super().__init__() self.fpgapart = fpgapart @@ -251,8 +251,8 @@ def apply(self, model): for attribute in node.attribute: if attribute.name != "preferred_impl_style": new_node.attribute.append(attribute) - is_versal_family = is_versal(self.fpgapart) if new_node.op_type == "MVAU_rtl": + is_versal_family = is_versal(self.fpgapart) getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) graph.node.insert(node_ind, new_node) # remove old nodes From dfc1b208c8f7d22ef80da1309de96cd2ac3cabd6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 15:48:18 +0000 Subject: [PATCH 215/291] [transform]: minor fix to how fpgapart is propagated --- src/finn/transformation/fpgadataflow/specialize_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 1563ef83ca..a8100a36d7 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -238,7 +238,7 @@ def apply(self, model): if not node.domain == "finn.custom_op.fpgadataflow": continue node_ind += 1 - impl_style = _determine_impl_style(node, self.fpgapart) + impl_style = _determine_impl_style(node) optype = node.op_type + "_" + impl_style new_node = helper.make_node( From e1a18c717b9e733314cd464fe75232f86bb5bc30 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 8 Mar 2024 15:48:22 +0000 Subject: [PATCH 216/291] [Tests] Update runtime thresholding test with new mem mode --- .../test_fpgadataflow_thresholding.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 8dee95fa82..a6e7e41596 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -245,7 +245,7 @@ def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem @pytest.mark.parametrize("impl_style", ["rtl", "hls"]) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 2), (8, 4)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_runtime_thresholds_read(impl_style, cfg): @@ -259,7 +259,7 @@ def test_runtime_thresholds_read(impl_style, cfg): ch = cfg[0] pe = cfg[1] n_inp_vecs = [1, 2, 2] - mem_mode = "internal_decoupled" + hls_mem_mode = "internal_decoupled" act = DataType["INT4"] idt = DataType["INT16"] odt = act @@ -274,9 +274,7 @@ def test_runtime_thresholds_read(impl_style, cfg): else: actval = odt.min() - model = make_single_thresholding_modelwrapper( - impl_style, T, pe, idt, odt, actval, mem_mode, n_inp_vecs - ) + model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs) model = model.transform(SpecializeLayers()) # Make sure that specialize layer did not default to HLS implementation @@ -284,6 +282,9 @@ def test_runtime_thresholds_read(impl_style, cfg): node = model.get_nodes_by_op_type(f"Thresholding_{impl_style}")[0] op_inst = getCustomOp(node) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) op_inst.set_nodeattr("runtime_writeable_weights", 1) dat_fname = f"old_weights_{cfg}.dat" @@ -343,7 +344,7 @@ def read_weights(sim): @pytest.mark.parametrize("impl_style", ["hls", "rtl"]) # configuration (ch, pe) -@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 2), (8, 4)]) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_runtime_thresholds_write(impl_style, cfg): @@ -361,7 +362,7 @@ def test_runtime_thresholds_write(impl_style, cfg): pe = cfg[1] n_inp_vecs = [1, 2, 2] - mem_mode = "decoupled" + hls_mem_mode = "internal_decoupled" act = DataType["INT4"] idt = DataType["INT16"] @@ -377,15 +378,16 @@ def test_runtime_thresholds_write(impl_style, cfg): else: actval = odt.min() - model = make_single_thresholding_modelwrapper( - impl_style, T_init, pe, idt, odt, actval, mem_mode, n_inp_vecs - ) + model = make_single_thresholding_modelwrapper(impl_style, T_init, idt, odt, actval, n_inp_vecs) model = model.transform(SpecializeLayers()) # Validate that specialize layer did not default to HLS implementation assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) op_inst = getCustomOp(model.graph.node[0]) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) op_inst.set_nodeattr("runtime_writeable_weights", 1) # Make new weights for runtime write From 62b1655a5322d3d37898ea5b42cc04506a9e81a7 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 8 Mar 2024 16:09:35 +0000 Subject: [PATCH 217/291] [transform]: minor fix to infer right MVAU type --- src/finn/transformation/fpgadataflow/specialize_layers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index a8100a36d7..94c0a87c03 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -54,6 +54,11 @@ def _determine_impl_style(node): if optype == "StreamingDataWidthConverter": return _dwc_determine_impl_style(node) if rtl_variant: + if optype == "MVAU": + if _mvu_rtl_possible(node): + return "rtl" + else: + return "hls" return "rtl" # but if no rtl variant, set impl_style to hls elif hls_variant: From f35dbf81a5fee7191fd7c507b8c614ac6548bde7 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 8 Mar 2024 16:26:08 +0000 Subject: [PATCH 218/291] [Tests] Remove mem_mode from conversion to hw in end2end tests --- tests/end2end/test_end2end_bnn_pynq.py | 8 ++++++-- tests/end2end/test_end2end_mobilenet_v1.py | 3 +-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 0fab1b298e..e90c412dae 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -130,6 +130,7 @@ def fold_tfc(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] inp_qnt = getCustomOp(inp_qnt_node) @@ -154,6 +155,7 @@ def fold_lfc(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("runtime_writeable_weights", 1) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] inp_qnt = getCustomOp(inp_qnt_node) @@ -179,6 +181,7 @@ def fold_cnv_large(model): fcl_inst = getCustomOp(fcl) fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): @@ -207,6 +210,7 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): @@ -539,9 +543,9 @@ def test_convert_to_hw_layers(self, topology, wbits, abits, board): # use standalone thresholds for tfc-w1a1 to also exercise that option model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hw.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) # needed for non-bipolar MatMul layers - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) to standalone thresholding diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index abd019c7bc..eec303d29e 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -92,7 +92,6 @@ test_platform = alveo_default_platform[test_board] test_fpga_part = alveo_part_map[test_board] target_clk_ns = 3 -mem_mode = "internal_decoupled" large_fifo_ram_style = "ultra" extra_fold = 1 first_layer_res_type = "dsp" @@ -226,7 +225,7 @@ def test_end2end_mobilenet_convert_to_hw_layers(): model = model.transform(to_hw.InferPool()) model = model.transform(to_hw.InferConvInpGen()) model = model.transform(to_hw.InferVectorVectorActivation()) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hw.InferChannelwiseLinearLayer()) model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(InferShapes()) From c9b1d3782ee9c39daf59fa7fcc0d334072a57ea4 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 11 Mar 2024 10:01:59 +0000 Subject: [PATCH 219/291] [Tests] Extend check to cover all cases for cppsim rtl swg --- tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 7482b789a9..02aaf85851 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -220,7 +220,7 @@ def test_fpgadataflow_slidingwindow( # if cppsim and impl style rtl is selected, the node execution is done by the hw op parent # so, no reordering/shaping of the output is needed # because there is no concept of SIMD parallelism in the hw abstraction layer execution - if dw == 0 or (impl_style == "rtl" and exec_mode == "cppsim"): + if dw == 0 or (optype == "ConvolutionInputGenerator_rtl" and exec_mode == "cppsim"): assert (y_produced == y_expected).all() else: y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) From 96712fdb4fd4c222301754f4ad8dac9e3c00a710 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 11 Mar 2024 10:45:33 +0000 Subject: [PATCH 220/291] [RTL Thresholding] Temporarily defaulting to HLS variant in conversion --- src/finn/transformation/fpgadataflow/set_folding.py | 1 + src/finn/transformation/fpgadataflow/specialize_layers.py | 4 ++++ tests/fpgadataflow/test_convert_to_hw_layers_cnv.py | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index bff64d3885..a755d37a9d 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -105,6 +105,7 @@ def apply(self, model): "DuplicateStreams_hls", "GlobalAccPool_hls", "Thresholding_hls", + "Thresholding_rtl", ] # these ops use SIMD parallelism, up to a max value of NumChannels # ConvolutionInputGenerator* has a special case when depthwise=1 diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index a8e8fc72c1..7b8545db84 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -51,6 +51,10 @@ def _determine_impl_style(node): if impl_style == "": if optype == "StreamingDataWidthConverter": return _dwc_determine_impl_style(node) + # TODO extensively test RTL thresholding + # for now use HLS component for thresholding + if optype == "Thresholding": + return "hls" if rtl_variant: return "rtl" # but if no rtl variant, set impl_style to hls diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index 96e945d083..ff61867fde 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -101,6 +101,10 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): # subsequently, the FC inference will generate passthrough MVAUs if not fused_activation: model = model.transform(to_hw.InferThresholdingLayer()) + tr_nodes = model.get_nodes_by_op_type("Thresholding") + for tr in tr_nodes: + tr_inst = getCustomOp(tr) + tr_inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hw.InferConvInpGen()) From acd4c5523efceb195b3add69ee1dfe97cacbabf2 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 11 Mar 2024 13:19:50 +0000 Subject: [PATCH 221/291] [NBs] Update tfc end2end notebooks to reflect new flow --- notebooks/advanced/cybsec_PE_SIMD.onnx | Bin 192077 -> 191973 bytes .../bnn-pynq/finn-design-flow-example.svg | 2 +- .../bnn-pynq/tfc_end2end_example.ipynb | 149 ++++++++++++------ .../bnn-pynq/tfc_end2end_verification.ipynb | 45 +++--- .../end2end_example/bnn-pynq/verification.png | Bin 55982 -> 0 bytes .../end2end_example/bnn-pynq/verification.svg | 1 + 6 files changed, 132 insertions(+), 65 deletions(-) delete mode 100755 notebooks/end2end_example/bnn-pynq/verification.png create mode 100755 notebooks/end2end_example/bnn-pynq/verification.svg diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx index d09d07d2bf1b502d93bc676c8901fdc29de51d6b..8d42b2e37b16e42012e4d29fa365388e70537eef 100644 GIT binary patch delta 81 zcmX?mhx_Sm?g=7nTW@o7y=0mw^>p%XR^7>Nj1f$qm?ke^e22`{XNGf`Uovq`E@T#) We3j))^KRDm-K>n;ce650dI11K?;q>{ delta 164 zcmaEQoBQk??g=7nbMJ6-H8W3?dMYNt#gUtu8=sq>lB&d+lA4@fT9A{PGP!`MleveP vYhr-d + diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index a5c97328a5..bbaa74dbff 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n", "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", @@ -114,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN. The model was exported in QONNX format, to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format." + "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. `ModelWrapper` is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN. The model was exported in QONNX format, to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format." ] }, { @@ -129,6 +129,23 @@ "model = model.transform(ConvertQONNXtoFINN())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the conversion we save the model and visualize it using Netron. As you can see, quantization is now expressed differently. Where we had Quant nodes before, there are now MultiThreshold nodes present in the graph." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save(build_dir+\"/tfc_w1_a1_finn.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_finn.onnx\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -149,8 +166,9 @@ "* [FINN-style Dataflow Architectures](#dataflow_arch)\n", "* [Tidy-up transformations](#basic_trafo)\n", "* [Streamlining](#streamline)\n", - "* [Conversion to HLS layers](#hls_layers)\n", + "* [Conversion to HW layers](#hw_layers)\n", "* [Creating a Dataflow Partition](#dataflow_partition)\n", + "* [Specialize layers](#specialize_layers)\n", "* [Folding and Datawidth Converter, FIFO and TLastMarker Insertion](#folding)\n", "\n", "\n", @@ -167,7 +185,7 @@ "\n", "![](finn-hw-arch.png)\n", "\n", - "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process." + "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by Verilog modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib). As these function calls/modules can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls/modules, which is the goal of the network preparation process." ] }, { @@ -254,7 +272,7 @@ "\n", "In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n", "\n", - "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L86), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." + "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L93), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." ] }, { @@ -407,32 +425,25 @@ "model = model.transform(InferDataLayouts())\n", "model = model.transform(RemoveUnusedTensors())\n", "\n", - "model.save(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" + "model.save(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to HLS layers." + "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to hardware (HW) layers." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Conversion to HLS layers \n", - "Converts the nodes to HLS layers that correspond to the functions in [finn-hls library](https://finn-hlslib.readthedocs.io/en/latest/). In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.\n", + "### Conversion to HW layers \n", + "Converts the nodes to HW layers, these layers are abstraction layers that do not directly correspond to an HLS or Verilog implementation but they will be converted in either one later in the flow. In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MVAU layers (matrix vector activation unit). Any immediately following MultiThreshold layers will also be absorbed into the MVAU.\n", "\n", - "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MatrixVectorActivation` nodes, which will correspond to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/matrixvector.html) library." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** The transformation `to_hls.InferBinaryMatrixVectorActivation` gets the string \"decoupled\" as argument, this indicates the `mem_mode` for the weights. In FINN there are different options to set the way the weights are stored and accessed. For details please have a look on the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals." + "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MVAU` nodes." ] }, { @@ -441,22 +452,15 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", - "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(\"decoupled\"))\n", + "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", + "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n", + "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n", "# TopK to LabelSelect\n", - "model = model.transform(to_hls.InferLabelSelectLayer())\n", + "model = model.transform(to_hw.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", - "model = model.transform(to_hls.InferThresholdingLayer())\n", - "model.save(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n", - "showInNetron(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each MatrixVectorActivation node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network." + "model = model.transform(to_hw.InferThresholdingLayer())\n", + "model.save(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")" ] }, { @@ -465,7 +469,7 @@ "source": [ "### Creating a Dataflow Partition \n", "\n", - "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation and Thresholding_Batch) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition." + "In the graph above, you can see that there is a mixture of FINN HW layers (`MVAU` and `Thresholding`) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HW layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition." ] }, { @@ -476,7 +480,7 @@ "source": [ "from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition\n", "\n", - "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n", + "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n", "parent_model = model.transform(CreateDataflowPartition())\n", "parent_model.save(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")" @@ -486,7 +490,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the `MatrixVectorActivation` instances and the `Thresholding_Batch` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" + "We can see that the `MVAU` instances and the `Thresholding` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HW dataflow-only graph:" ] }, { @@ -506,7 +510,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see all the extracted `MatrixVectorActivation` instances and the `Thresholding_Batch` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." + "We can see all the extracted `MVAU` instances and the `Thresholding` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." ] }, { @@ -518,6 +522,60 @@ "model = ModelWrapper(dataflow_model_filename)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specialize layers " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The network is converted to HW abstraction layers and we have excluded the non-HW layers to continue with the processing of the model. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the `SpecializeLayers` transformation. It is possible to let the FINN flow know a preference for the implementation style `{\"hls\", \"rtl\"}` and depending on the layer type this wish will be fulfilled or it will be set to a reasonable default. In the tfc example, we will set all layers to their HLS variants. To showcase how to set the preferred implementation, we will set the node attribute in the `Thresholding` layer to `\"hls\"`, for the `MVAUs` and the `LabelSelect` we will leave this node attribute empty and in this case by default it will be set to HLS." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "thresh_node = model.get_nodes_by_op_type(\"Thresholding\")[0]\n", + "thresh_node_inst = getCustomOp(thresh_node)\n", + "thresh_node_inst.set_nodeattr(\"preferred_impl_style\", \"hls\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we will call `SpecializeLayers` to convert each HW abstraction layer to (in this case) an HLS variant." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", + "model = model.transform(SpecializeLayers())\n", + "\n", + "model.save(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each node type has now a suffix (`_hls`) and the module (`\n", + "finn.custom_op.fpgadataflow.hls` also indicates that that the HLS variant of the layer is selected.\n", + "We can now proceed by adjusting the parallelism of each node to customize the performance and resource usage.)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -526,14 +584,17 @@ "\n", "*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume. \n", "\n", - "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four MatrixVectorActivation. So as an example we extract the second node of the graph." + "Each MVAU_hls node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. \n", + "\n", + "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a Matrix-Vector-Activation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four `MVAUs`. So as an example we extract the second node of the graph." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/hlscustomop.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." + "We can use the higher-level CustomOp wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Above, we have already used this abstraction to set the node attribute of the Thresholding HW layer.\n", + "Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." ] }, { @@ -564,7 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n", "config = [\n", " (16, 49, [16], [64], \"block\"),\n", @@ -581,7 +642,7 @@ " fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n", " \n", "# set parallelism for input quantizer to be same as first layer's SIMD\n", - "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_Batch\")[0]\n", + "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_hls\")[0]\n", "inp_qnt = getCustomOp(inp_qnt_node)\n", "inp_qnt.set_nodeattr(\"PE\", 49)" ] @@ -658,7 +719,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In previous versions of FINN, we had to manually go through several steps to generate HLS code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**" + "In previous versions of FINN, we had to manually go through several steps to generate HLS/RTL code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**" ] }, { @@ -740,7 +801,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:" + "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Both layer types are inserted as RTL variants. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:" ] }, { @@ -1014,9 +1075,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 2f6cde6e5b..a07a8d2254 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -7,16 +7,16 @@ "# FINN - Functional Verification of End-to-End Flow\n", "-----------------------------------------------------------------\n", "\n", - "**Important: This notebook depends on the tfc_end2end_example notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files are generated to run this notebook.**\n", + "**Important: This notebook depends on the [tfc_end2end_example](tfc_end2end_example.ipynb) notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files are generated to run this notebook.**\n", "\n", - "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. Besides the methods in this notebook, there is another one that is covered in the Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb): remote execution. The remote execution allows functional verification directly on the PYNQ board, for details please have a look at the mentioned Jupyter notebook." + "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\"Drawing\"" + "\"Drawing\"" ] }, { @@ -72,9 +72,9 @@ "source": [ "## Simulation using Python \n", "\n", - "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", + "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow.hls` or `backend` $\\neq$ `fpgadataflow.rtl`) this model can be checked for functionality using Python.\n", "\n", - "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of a XNOR popcount node.\n" + "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of an XNOR popcount node.\n" ] }, { @@ -95,7 +95,7 @@ "\n", "This execution function and onnxruntime is used when `execute_onnx` from `onnx_exec` is applied to the model. The model is then simulated node by node and the result is stored in a context dictionary, which contains the values of each tensor at the end of the execution. To get the result, only the output tensor has to be extracted.\n", "\n", - "The procedure is shown below. We take the model right before the nodes should be converted into HLS layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs." + "The procedure is shown below. We take the model right before the nodes should be converted into HW layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs." ] }, { @@ -108,7 +108,7 @@ "from qonnx.core.modelwrapper import ModelWrapper\n", "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "\n", - "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" + "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")" ] }, { @@ -141,7 +141,16 @@ "source": [ "## Simulation (cppsim) using C++\n", "\n", - "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + "When dealing with HLS or RTL custom op nodes in FINN the simulation using Python is no longer sufficient. If the nodes are specialized to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding `finn-hlslib` function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS variants of the layers, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Note: HW layer can also be converted to RTL variants, in this case \"cppsim\" is not an option we can execute. If nevertheless \"cppsim\" is selected as execution mode for the layer, the execution defaults to the parent class. Like this, networks with a mix of HLS and RTL layers can be executed using \"cppsim\" for the HLS layers. \n", + "
" ] }, { @@ -158,7 +167,7 @@ "metadata": {}, "source": [ "To generate the code for this simulation and to generate the executable two transformations are used:\n", - "* `PrepareCppSim` which generates the C++ code for the corresponding hls layer\n", + "* `PrepareCppSim` which generates the C++ code for the corresponding HLS layer\n", "* `CompileCppSim` which compules the C++ code and stores the path to the executable" ] }, @@ -280,9 +289,9 @@ "source": [ "## Emulation (rtlsim) using PyVerilator\n", "\n", - "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n", + "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers or for RTL layers directly using the generated Verilog files. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n", "\n", - "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS nodes could also be executed as whole." + "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS/RTL nodes could also be executed as whole." ] }, { @@ -380,18 +389,14 @@ "source": [ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", "from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP\n", "\n", "child_model = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n", - "child_model = child_model.transform(InsertDWC())\n", - "\n", - "# set all impl_styles of the DWCs to hls to enable emulation\n", - "dwc_nodes = child_model.get_nodes_by_op_type(\"StreamingDataWidthConverter_Batch\")\n", - "for dwc in dwc_nodes:\n", - " dwc_inst = getCustomOp(dwc)\n", - " dwc_inst.set_nodeattr(\"impl_style\", \"hls\")\n", - " \n", + "child_model = child_model.transform(InsertDWC()) \n", "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n", + "# DWC and FIFOs need to be specialized to either HLS or RTL variants\n", + "child_model = child_model.transform(SpecializeLayers())\n", "child_model.save(build_dir + \"/test.onnx\");\n", "child_model = child_model.transform(GiveUniqueNodeNames())\n", "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n", @@ -455,7 +460,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/bnn-pynq/verification.png b/notebooks/end2end_example/bnn-pynq/verification.png deleted file mode 100755 index cb50ba1b67508b45322f6b86bfcbcfb02d3cc9d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 55982 zcmdS>g;$i_`v(dSJpuv>NS8&^<7`_xOB& z?>Ya%aV=fTz3*LD?avi^zAMU0K1U};2Z2D(r9O#&27!=}fxq5hRG@`vru7g6@&ieU zi>SCQ9Hn72%NfIZMpsNTppP3oJoY-GpMFW_em3VYZ!FB>P)bh3%Y9=^&lY-|gBLUL zO(NJA&oEZ1Fi0env`**s4U}f+zTA}eaOlILU~fZ)_nlzI=uPK*Ld~qlte&ONsE6-K z)8D^1qF}0^{}23=b*|#V%p|LoqGLEQ|6BRk@Qg!An0~Ml+goY0ON{;B8NQVfCtrS& zmb!b*#&%F~=i3Q!$&i(j5&EH*d$9h&lzVRVwi)AE*KHp^WO2o9(cf}$*{j55`935k zJ3G3!m+ok|ucWm=E3)M5I5bo!>BQRnmwb#`J%2-9TS&ub9vw}7NpyJhYK}Ki#t)}@ z6G$VJgj2vbntL~O(aKfJqerl1#OraUD>l|$X?#3qxQ$|XIpH65>A`X46I5^Cx__3n zS}L2mWI(fpJuZg?M|vzKHhEQ5x^4QuzUZH2j8I>J_3bVDhT3cAdn$05EnOt zm+I%6U@ykT=5ux?j*I?-8xxbEASI`be|$z-u9?5$pW^crqK_uXJ>cehPtY@TR~*(% zK~T16D4SH(w~(NK>9bP*SXKsQD>j8hg@*Fukq`(?K9uM+biDXSHxvtSXha0L1@V=$ z(m1OmOWS?pqr!9V`CGT+iLWSr*{{VGvMwI!HaBpijE&Uvi3t+Mx^+U#T*M0T$%x3j zP2-BDU*(w^k=LJO3DWg?-4?RiZr3`@M^_P)vf|HPwP#l@$J8Bp#Ymv6{Og{~YAE$rMACH03!=M9Jk!daRQG^})%ehv|9#{8xK`1U(- zUY==0S-Ps@80^p6{!rR1(USc2OS!JQn`WikhCyZg#rXJ5p8BxP%EN}5_G;4o z{P%&Ckh_2t{oXV)C%p@@7cYJ`4LL)Id)kfu>B_5WZ!{g64t70rV@iDWTmZ$j_fjjDO{awf@sL29Yy=9OP!S zKiBRa^;!0)s_eyW#1i{b8q(6z+$ac<&l9VR0#{GO=&1E5$~(3^`G5Xoz!=>5Ar2R* zA|B50?~#k=O^bYF&w?n|EY}X4xtD$TsaVZjjbw*PsT#4XCMVc;^v8GV&^R5HGa+K8 zZ^s{7?vKMGA{K0ICfD8{tNO$BUZr(n>4~+nGKABPTT$-!uy%|oQO&^=UvhFh%zCac z#rRtQzf@si^mLK#4MCg|%DtddaP!P;?c$3UFRob^O z`}cv+jg4s9+~52!wxAUT7I6_Lo=F#cItWEvh&H$FV7uOvYNik zTF$wp3~nbA_sk)}d6D&%QIMAga#2OXCHS&0P+sP<=+yeLPv!KTWrP3ZgZWs(3ZQd6}mJ>QYWXM`+-!xQv(6jz;q&Ok{IqA*w z|t_vFu4KOl^yMNb~sNA%LQKV3LcQ;*PWS(0yk4yFo5`sFu5{DDz zA4xwFAJ`2ttvt|^x!U&Vs^og@)P#Yh6#ELK;~2=;*orBXnK<(*kZ<1J4$9COZf;4O}Xb6NcC0V;?L_BqN9NhYYAin0 z#-C)b)?DLCzZmy+cT;!+RyeWa^H{@TXYa-a?o$kutn__uuIW(gu0k}aDMjpK`&nN> zqJRnRYS&vtxHgj-?<9LA!3gB?lD6t~%5t7P`%W@qGv_8XI9Q&%T7CBsqb$*USjEAFUYeCf#80R82g3}|I8oy>D3WMB=wCtTn{_0b>vZzGKu7B zB3HYG`};`DO$Xc|gcnMJvbUGTV&n>4Bk!LGsRIBaqO`FNKL@cLqg_&`D%C86VX(9~ zbO`u*81d3Gi?;JNy-yZbTErR5$9EZQ33_PD%!d}$Uz}?-8}s(xaoBk(xTkH6`Ow*+$rmO!~P#6XW4X(X<7r z>5}k#_KB^IS9Tt?wPsWk3nO*aJy%@K*v_`XO$gMQbre^ovtAM|Rw^zgJ-^Ebx3$Ot zsd$X%Yb?2MV~JN;P_-DQds1A@=cdX3ZR3)5KYoJq>OmRHq_>-n)No1o?!CjMr-ZCk zYJiD*1l{Rxwf@_5Z}rOR*4ky%i&Rd5i2YB1(q?@wjOlOVPY-QTuf2m5waF#h&pkyf zH9`Lzras2-@q48;l(qb2aPu=m>5uuTnWkwzki@v~aQ#I^c1Xs!<>K?_Ng-064bEdx zooCx$YDR4`&pxp@Fh~OQ2`g7i4-M_+tU9zIhb`QuT{ybjK1M%zPPX#ccXTIMW8S!N z4EC*6C+!tKgE<6&%sKau_`rj5p{|;0*(bLf6F2>F3wIy7aHt&DJjrm-g z+>}j{?+l1>f!P37bm*<8m*_?r70VOk3nj5%^e!`}P2*!(hCq5~;PL4x_mFiF!5H70 zrWLsek^&w*cPOQV^hIEBo1Dprhi~=+|v2IT`)iK4mfaiFBBl+VD}0LH=T7!N_n!$mb7i73iOeM&@980X`m%Rc>q@R}x5w{oTtb0KzkO>ENH_huv2l&Dv#f|i|QxWckiHVD61GEa!D_k5l9VHF5zH1pZpNL=_@^Sje%Oqf^|H9)Lf8oBWnnEgT zvoVKd}a^CJ~VI}R<&Wc`7)2&%;~s8k(@ai-OTIByI610DED1;pgLeNUCb_^@=E>p@T_emCp{Uk$<<0FVrq&5i;Wz zVCKUNpDO2|yQjMpx@_%Ao|oQ0IQeo!(@3S*ex_sY_STfaBZS7-Xi5&?nuy!QDj(tQ z^~+Q)V`~8a-@Yi1eu{HJ=O1aS*hY6Gh+3mV0`8iM!YqpoG|VSP=RF$X;AXhP5%c-; zDrRvz6raP~rK99Hn!jK0)Nq8iE@`LY&AL7~sxzFZ#9TmZTR$F14abdNU-vv}Ra z^L>~+Mi7oMAt*}7EWXx;gOLP1_vc981O2*w8+{9yd58dRLv$l!TAF;D!;4EhpQS+_ zye0dSj3~IHq)w7B*&5E3;48aHVCMKFITpHkqG)P2)~V>thU+iC_?^%eIMq#%EXJY6WCkDRppp*HaEw88H4G87EsN9+94>E7_x)s^A(uj&ifyJ8=0PcJm%ph zD|X`u<)C4D{G?)abd@{*J(48DRkeng^J!2Jvv~zUL>rk7xC#iD5Q*gqh4G1A-9O~&df8T0G7*D;wxyQ4($63g$&ULksAH6>(^$HyAJU~Z9mxsM|t zu;q^NR%;oVoih^00-N2f;b`wZ8TjXfu&Y~rUVLe9 zITm1+eUF{t2||ywuwkcd*CmR6=+QUgXE zrvWh6mhHKbnvkXYjb!>(@4{$(?&dOGv5^{-tz>WVr-u_YTZHkKtn)ILI+x!}qBJ1p zMKAuk1j_IX3V^#lSz0xRk&SzQ7>T&o@|E#x>$lGJC)tYy^kcoytNI{2WHfJ>TcA2SnlD()^ za|~uRwbkr7+IISKfx!>%u1j2%B%^M7WHmnR9@{tQklb>@LCf2LB2{R^g_h1xy+0`B zJ1tF1m~@C1bRl0ZxkiKkv_Yv_6=c5&NsCx{3>kYxB?|yT;)_%3?(&zwdC^rJ;w;qP zbwOYYXj&tFn5buHLYQ8rB4PpS5Iz=LVW2QgDQ#`{8ICl2S z`F>o_QXk_}lTF5y(d3;vdiYgccV^)mcJEiF$yp#reThb~OyNEe8ugtO?BVYm!;9({ z$c-c!p=F+4{M0>)58t0sXU}en$KAB`(_pmCX%@o9k|$+_qB{W;)9Z?~Vc8}UV_-ua z1DI5kQTc`xh`4_vkBNMmE*u`^eUNKrinq zpNp3(p4bA^4n+*dy3GGYi%iK_C=lX^6Ed?Yr!Q5nn20`+b|K*aZqxgOA!cNe9?wZwDS5!vLl}JN(rnzA zU(xcwwNEiexn0^{72%ko|4I8jH6AjRlG3sO8x;cwU6@(tYV`gJK(ZBUJ<^}C0y2$9wb3SLZz8P9M*@?h^tn`qOTUR?VMk^-Dy`^ zigY&9pPWDbyfb|mQ@sRSA_{BwJJrP)< zGP45TP=McYtQZc3WK6tj=K(IqbE>o>)Fz!W&Yw!Go?6w-Udx>dSSxCpfd;U-oW(ap%_h1%w6KzJHmzOX?{e_=N1NMSOpPT(lGvVzYa`ZHxgTGfalwcdFKoD~azR zc9dzUoP2$oqx^N+d5cTIWCr=D4A9}Vt)9ryCWfjWO}YkjqK7HVvA!?Tn*W=He&OJ` zT?5zbP}xoA>1@x*yFq(UT=#F^VeSQ3+2crGUjaOVVH*8MA%66;3c>MXjT@Lf>nmo=eG>%&(7R6rXWit#voJ{q$j_|*Pl72Yol;#n_`gMyf z#lu(J<%zt$(`SkT{{K9UvuEF-fR902{42kiq*bEDLlcM>(t@)*Q|}ojtW`c!kE?A{_cWTlN%k7w1;O;B(I3+HyD~Gh47q z&g1yKEQYiUxqjT&AA1F&5;ZcKivKssgGRETlc#y?iOn62 zt6Da>fw7RTxf9%`5BrIGhJ^BlhH()xBK{|j(dU_`r^|Iqm@^Y>;)*3y+W1Svp`?;1M5NZfp=^1;NQ#@JDrY+kr{RU9kX@ka?11_yGth zzJC4sX8Zb8td?oUq}vsMYA0%`{)QXU5v$j<_}7BIkOGdRYVrs3u~RYc4A{>Yr~R_? z8F(-e5QC%!V|5x;4;`<+Qo}L|O~I%F(C2K&$1~yoH?Dp|uBV%h_L0MWXSRu#H({6~ z_lAGWQ=z`Ms@H1qnr8m(u(sZ{MUyt)L^@T_^-cS(#5h1GiPFX^m|^<0?cXTMAO@LX z9p>3&8EyTh2*z>aQMe2LGd8tx*lO3+F;M+yoNCo|6gRDG4w)Eu`G+hwZ4MJiGvQRpVulqH#e2Di$`OpoIY4)_Gd?<$3WUZ0p*smBr$jQslzb=wM$7O#g~wEI3Qd z!6KQ{5#JgCM~naV7IcLZ(HReH;B)fFz2(?ThQqv3?H+FwhEVe8=X3b*_^rl^V~3@O zjX_#v8FCN1wr750Mn}IDxD)eXwiR^Xtgv4LYCEZuzvt1VgBa44HlOSPYM=jT!2Uo` z3&bTSqGbW=&V>X5Bka7Jw_D=BI_by;))Uoi+_XzMwAB-NE4@7;8mmQPnBO229DG;* zQ!~(Uw58A|$MH!<8vzuA#jwuli#Soq0M0s(Ma8y-8IHEfzCAz#Z00;qlR?bzj$jhe z4(D9G`9*l^KA@%x&p?zxaI|g=L$3Z?Mj139V+|y z%iM^YBM8DVpzI<7e7YQq8sZsQ@oF@XuvsJ{yKfukV*wgTAP$2wAsG9{D(zw370TQ3 z5=b6W+5wiKGn90cl6IIQLNGKHnzbWZ@YWxopYU;C{*OWt!w zp!ao1yCL!#1OhvddwKbJfO`xqaYVaS^A;hn?WA4()q#Ev9&=Yq$sE8gO}+wwL`LSM zGHl-a&bDapxSiI!WNPGDUjqjvmj$rREOLk@L|g7sa;NwW=(tpaEBRSzLx-}i6G?|Ze9+dFY(cL606&Co>3H0OUzImM{^TUvPo*I-!v^_LiqmA<%B9Dtvn@0|RjqpRzu+5Qg}KTd7yxOK z&lq)!-g{ZSXylt+M9F6FI=V~C*y!;A!51v6-}!L35q(iL&v()(;rds?~~WdTdJ1G$Xl1IQCsXJ z!QbM^BZ1@|mwgYveBzg^_nC^BM4|ALtz|-~6|_?KohMuGeOv*X9GxeWDEjP$XjYa9 zvG@ZJ?jaJ#%zv~q<43SY0O64Gb6y&6);2Z1stLNh%`IK@isAcc&VKY#w4>M!xPbs) zhV0rg8(Dwz*@(4I)>+rut~vZP!kcIuQJFj^`!`1ztGie{$HCU3^gNt-CN62Z0KB%# zM$Ry6wOGy&zImGo@E)2jii@G2Q*kR(ORg`U-}UJFEAZ;DT3 z1&5x!4Ex~ftc04401~!CMNEm(J$|a&r&fE3j)QLH4wz$g*X5hFSMpJcDkdZP#&KAz zxcDI9wR6}dON4}6N`eP(5MmIb{_-D-nfXt5Xt74`iF3t3Ab$bo5)ZXuHqBRApsaS) zKQ>%!<}=)qysV;9G9ex}6r$RIX1h^R;!)z&dt4!-^1v6_vxp=Pr z76KOzM4$@75B)jv#!r~oPD)0C;jaiLv4V|l`=(_Xw6@zFW_cfAjvw5(igkMU_wEV0 z&X9yWmZTRwl!ieJI?EQ*WrJkj5kR2K&*PMv|8lX0KUhWai;RE>?X^@@EAPN}_$C-q zMhVJS2jPR%%Hf&183#l}pxw8FwT}m*I9Tzn=sSSPT#wI(Q@w!p_VW7mT0V~SFT^g- zvm9cRnoVzD7y7k!6Tng+Us44eASgLd{f^Ae8XOnBkpDZ+niZQ%{sYjtme9;=aBneL zWyLAg+eoF-E|G{D7ta9Ex>|={|E4wd8Hu89 zsKTo0{0n3+4AJtgiFmx|=!k$}U}PZBU|(r&z?IasxrM{fa_}#>Qx8hVgUWfq(~Vg zHSL$|jB?qP&fqxNXuYMPs}J(@@UaC`MY<>2larWTGgK;G?n~?IBy=OU2gr1|@HJIn zXLo;}=atWA4WJ#7iFPYPOOl-l`G77Ef&$`*F9ACEWv*?gm9}vRjD*ke2ZiN5!xp^W&ERzh zh6=xz6FYEda7h--Cs)JH`Wv@>CVrF#*Cs$SKwt)Vh?zHUmMCOdtibMh3v35VmJ?oa zBQh_u1`lCB%8R|D5N*YWU}3L50%aFG=I}`sCn^UYpST1@@vMN(=^P;PUb1Zj=SauE zLn@i1gXgnkgGZe6UyGZXk9AV)SdM>yKEE${hk@R$T&1I!OBAAa(`C}r7E7U+Ku5VKMZ*(X{^QYdhF_>U;zeWP4 zR9rud(~jO+Nub1F3{}*a3&l!nQep-)1Vr`mBAhwJ@;;}JvvlF$iAl;5j&oS z29x+}2{CtvMx&Ks>nyWw&9H`mMMU@c{Hodlf=ziR z;E;|X+Wnyo4wtqME}mV}7~f zy+WPAB(kxpfAYyF@jdPE%9BRu@UfZpjA4X;>Sje~i^NSlcM^IUd&#-uy&JlAYfBSh z?z!phmvFpzTwqH_A%VZHVvC5IY^Om?SY{-pC)>sRQ-ZK2IImj~yLs9~gaW zhr$bFCv}La_cW`F=1_cT2)r9=Pl$sygB z=49Z)e4CWz;mSAI5{BK)E}qvA|MpkX6_@mi@h4;JHwh*4aW}jppMSBV1iZGMJ(oja zuFE7({AJ~+M@h)hr4_8PSl)J(I3NS;C9EkeAJV%doMypTF}f_x!TRAmmBB-|Oq-anJ2@*zkT85dgW- zrZHk0Nd-R-v{Y2R9Z3lp8nNrmR_Wl>=t4Y6f#>eq%mSX8>X`}NElZ7`51K8Cv~2Zv zM$FSO7&gz0YmM~qzaE_vr*I;n?Rv-0%&YJBBuNK5aV|8#YBg`Lq_rpxc_-J*G5rGK z9aE;UKJvPkK75g0f%=fLuQdL(t;uh&9&uqD6kcngNXs=+(;P`fnQ^6B_2CE%cl1t$ z94>HFUjgWwNeEP4^-*Rge+Ef7o+tSb<+KXd)E#U^ha;f z^lnb6XKp&NQ11}SHlbIWb2sPYu4U!kLzY5X0t_-UyL=`W3&veCGp+5ifJw2Ngt!i~ zdXFD!h~b5K9NcSvC=r@$S3O}xI^fi}8>fp`q95A3MMqX^Y5Fd$YlT|{^CzD%WK*?7 zVn;*@B`3~(Jp=pPC-n#Mr!Wl_* z6vI!u(03Onzfc2IH=p!weVl^$JcSPIq0kr>aMEgNHY%bwOX(NLw<_1a>Xa1&U8Evi z_ww6i(bFa-w>6$Dm#H^yIHpZ+!Sm6(E!U9m`FaV^$0^gdUS{49Z<{)9kZ>luWygUo z06@z3BY#zf&{q$91HwAC-W(SiN@WtB`l_SsU-FdfQ3Kgg5Q&JPMcEe&b*lHj76&OO zf0`0I%7{plfK@g!DXAxSID|%ue+!luq#Gz!)_(NwIX175kuV~+Hq0HZPOv7CDz5Hb zf;CsLZR9>#>%SO~S9xXRTXs?8Ermu)5IoxGI{9WHc-s88GtK;p+OF#MA9Q335{RUC zbn5zU@9a~&>DK5J(#B!cCD{mjgNa{dTaJfy`o3%8gBNrhGpH4HjX;e{Emov^#Q7Q@ zd}Qrn6B3o>)#8!yz_)5?jZ_YC^oft+_1r0J9_4<@C|B=ejTawOU90eT-8=vT&{dqr zbK+5U$aoHqINs-NQ6<(&&v1_)eecZ&#CE5xOdCS2eMa!WsgtxHB9j~`OIRi~UI`$t zT#Yf1jBH_sfXLT>Hr7G7(XKJ?CYx?6fmz4kEeR}|T4eowvM98k>Iu|gxm7UEJkm}D_2 z!yeo|KZ8<}Q2wYvYX}~?*-yt{>-~0RhDV3PBQ|3WG4kPVY0Gzqju+>(6?BwU8-mZt zl=Hp|L9QB%Rb8GXv%k?$zj2(^hjwD25KDeZ7~khPQ=Sxg1JuL%+T{}$Z% zOikOG?yE~7iWqc^%p+xcYrTS2PWiXnAKf)-CRrarDp^M~JMi8>-sjdupAf@;_IXGiY%)pC?-=(suAp z%6&vZ+_g;0#rZ}g)g=D6=}nh;fb`3MZ0}@2in~c_*nBs@hQrfq# zYk=tJJdOJuB#S>xn4kh}DM!;FiF!HiW6`)N7AA-`U&87yt~CJpdzF_fD2SW9=!CkvBl z|5O;`JI7}=BTO0NfK#`**^1GVR+L9x5rav$m0UoRH|@NOQRU>smV6F_-d-q(mUi{j z4+qd}|Mihf1MKFj1mQI+{co?lvGGif4F4tv%)IcKo*D7qyPiWjIl~;mS;w3>_*NMx z+RDZ-5DcR1`y^BTr|(jl+kimHE&-!KM?cc*;*t7e^2!9qixGsBR=XLJ1a-=>X{nRD zbqJ9`n3ly?5?9Z$-`22Z+cJ}unL(XJZlm)nO}6xree$%ugHp$`>j`41lcb+s)6RTi zzsd>{T;i!bzu0RcBYJ0~S|2||WFJW$9@OQOdP5|ec7@9YtW*1ntWo0W`nO*3q|CH; z+<%f6kj(K0R=PI)xIhikDyw=dvFyUDw*n0TcM%c3--D9F5ds=9s)MOFFH zLNs5w(E;c6raU`SV199R?f&$%HeA?w6p%u(X+3nT+6JZp9~HZ*w(x6c4FzQOgG}9& zVN4xG4f)Pt>Y?X?k52Op!_~x1=gQ4N+$p)b;=<#e)nN9LrWUk)c6WbHg^w>rT1_3sxXFuXj&jC z(3NRsEXF6xKlcgm-GSXTZ=|}T9{~M9c*RzG#8z+6U$wy241Hlqfh7OFb!yIs>Hymu z>w}zmJpn-qo_Zw0WlCx8?TIuO1kgYr0MhMSQS46lXBmA)m`FtFPkXU};yObNptNoc ztBbP2qscjc;Yz@hh~&790+gvUo>tIuB8rVOApWtpgu6wB{pbz4ldf`(8YF|K04e%} zpqz8&YCl{G!4G}6mvxgD8!3?ks6u6&l)3sVpDjGA%)PQ=PKM^Zc7=(93d&^Dyz9Ul z>BTv}bT{d8%}YoQNE7!jGpAm4JE25#{*uK;h4~DmLVMWc%H68}C*6@Ph@Yyli4wj5 zouWAeM{U7-c{rmo(Q03!6n&`s$Iu-R2C_bLM<~coKeJLcHGyi%{c=={AUWLv5&l)r zrPoFh6X_5#;DRC|2=^z#rr9E?=fXEf4_=W7xR!cPK95g`0 z1lv)J4H4kBKPc0QL+~WvF0bw&xJBGQuNJ7%E7_%_#R2Rl7$hxO?h{DB8U;ehMqQbx zywJk>`rjx&dt`bJ&du&NxvgL$8Cp$6HivaV2uFl*isp8_7yEwZ3*=wFJAlQ2k&zMo z27b%bg>Xi#sanyZ<2g40B?Uqbr!T<)v^ZPDzH#eY>?w~>677)ZgQkBWZO+r4H(UKZ zKPh}FhBm)1Hz=fjrt;ghzn4$1)~m__9K$Op>`npWgmJe?@L|l9@^*V#y;KjR zeg8DGn?C575no=sFk|Xgnj+|U5YbHUAxuvx$H{Ow$GoXjROWfEPvJb_qgSe z%j0A`HPtKHyah5kvkQ58aJ2L;8C`7T9M!>SnT2_IU#Ax;mvYq!Lidy`b&p(LTBv(2 zPq!|_H;*z%P%1u)U-+0SbUzE_L~DDTv2ZQT_&qvxKR>v1)q&P?rtj0gNZa%X`SZFo zrd9;{w<$zo(8))r`e-?Z*1q`#7=*qBBz()+1!hR$q=`Iy+-1K>_E&)m?~2vX3J1^eD3U^9YBb=YgHe>9mV~QN^sK=0VOIy7LX6k8&=9$+5&m8M<%Vlve$&rg%Z&kXhLp zB@od=0UaHg#PghO0j#8``}-AfO8aXOJZ!?l3$iAp9< zf7G0!V7TT9@|a~kv%0{{qyV9r-^BA12CWSPA^E5ja!YJ3T$G+$glJPv?l;sp$h!OA z%_EE}j1jH-#ShK`5dF(4_{#TW*kW4`9>Ph$MiDJ?4VYFUfZy8qcsV9^hgn<2ExKq} z9*$Xh;hv;@DDC|W(OF=GL&Ewqf0AXE28x!hgT=UYaCl1It+cM1avsyB+Iy&f|72a z1YNGx@eTzA@yg}4$TtC@R<32HbMcEdLa*gx7%>qL2yck=;6AKx+x`n30Za8 zh(iv0Wcev?!JHNM6+b;Guc`5$vbkMi2OzSXlpRG?aZ^sNtRSjq>#x3_)MDM*Jw^0cAi(l07~ z`htQ>$y~XSUwIp-h~F49{J)$5)(^0lDmwE!br^Ex(-=+PA9MrxoM47nXiWZ;i9D6I z`mN^_Cu}bkdE&Wnwo>{FJY6j_3(JKwot4V{>JV6@DaTap2UR@dNc+LXmwCVag%1x} z_&tfx8PS>V2KArM-T!7_X`s4)sIaSx2u~UpT&-jI zqw-D@otUkq$ovK$ta6?`I>TlxYF2xnxz*7_}89kkfe+$#+-_L<2l1m|IFvF_crGxX?e3+e#_!8!OUAUR6Ja8 zz>B!LKCgvMRF3C`&eMymWO;=WJvvckvX(m}&rPP|j0}WMnw2Zd z&WoG7wFoY*&}pvEA^uG>)xSx&cXao!(*fYhVVSJYB-YVko_N);;`(uBbtKtCzXkdF zyjrkm+?833M(X;4ZD6FMYJ?2W-?6GCIZAD*?+l(3qgpAg>Up%<|A#_t5 zMh0Fw7Q0Zh+03ND0mWjVv#sQ5cC_=g$L^H%uT%jN3u?~@4U3w$&?@t&m)|NMhyZTs z(GF=!5PcmVmqMCH(UMfIDz&6m3j@QM()?%h_+s+sar}j`{hsZdz`gkJUCa4^iCth; zZVni2k)$bbNQb2ch(*JAOVhMWrXE6^cp+YX!M5@r&EDf;~5Z01nx_RuFdarmiJ{& zC3NR}-$FcVM}UkcQk*altp-jDf5XO=G*kTS822tETbFtE8P@0jo3l`}$tG&QW#OyP z&$dE?ub&6h{+~W``+a<&#Fb+9Yhc{MwB?u?6WUvU0No0M6kC_BL!fHOE0&b-PLJva z*v>TRZ{Qeyf4)NeOnF$4Zh@i{kwe*&s}Jxm^qSjdH%WLuA!Xzaa(xvCwX^iO!|+E2 zL4t(vM+_MKeJlnFkqp-nyIPCDxW;M=A1Vrn!O0}c zSf6E=AJY!?>(@<}B=UC{Bw-I9nTpklEByo6dG379v^IMyPlzm0v1)w^0C-IthqfKz z)=$hXoP;AhUhuhQLBjpL75zDZvxUn#jVv!bxu7V*VYA>Bx!+;)H~6*hxH9CVIRs!7 zyh8_`Xi6q8A>qQi3x{+6DIJj)r$}lVOHQfVH7q%@bJMqUq1d6 zqUU24^g%nM1TB`uuO96vNm^4G#YrTKj>Q?>WaOBYYzah;Sd|=-^ajg^i!7} zCqhrg82`_QJy#U>Y_1IO_tca)lyH3jsDLNS2XNRCDE8=oh9wQO_=Sma(}n|&d4a`z zUV;U;+u?+y#K}~Q1)d7s!ac0_|2V;e12{-l-I3YS>cDHgnARWv?Hd^puK2(_0O|oA z)d{UjDCG`9Tqsg9ssjHb2;w;52=Ml;yIyGvdn5C|+5fkl|MvlcdT@e5DAM8L{Eyx! zXaR}!^}d_%`8K~V4ZQ>MvjCX}GYKR}q@?;|>~abwx~p^kHICv2W^jKs++6t!-yZNH zD;Rjd^WRAdgd2s+_WS$e&k^vP z4^Ugw3%t5^QvM^Lp}3`BkqA7TOhdv0+A|-Bu8#4j1#%9)zN0^cq~=Ail%GIEpQBq`1xfbtRc+X<*beu1S3eCv0f|8Z~-I~i8Y^a}4TB?7M9 zKj3VOWq)hFFR7B{|0wQ515~J-$CRu8uZJ4ALD~Nbq;J+MY+;9SnOjeAv(gyrsW@Hw z_9{H4BH;tWGf9(p4AX!9O0Gp=3}cYdw298w2Pl>u8ro;*JSu29ns#%=bH6e7y$5(* z#d^tt#sCZwzGH>Ij5DbzVX4S1$o52Bpv(R3LqO5MdA`1^1LCe_h>y?kK8%16Lx8|c_rKuA#7E$k%^yQ%CYmmKLc$0iC{d@R<+`@3 zle|xRSImcLI$LThdq3-iHAk7G3Z_N`A}R~{sCIg&ecO#GHnV;kWjvZH2O^#3M5;oJ z#d-*I>N<{OH?ILK1B!+41PFTbvLJ2!`l+dl6qfSERLOkmJv=+lcjBEn{Q%(5Go<;aBQz{bFd*$?t`gs}vhZYJys54>P&~(D_CGkowi8z;^clP3 z^*bhp!h2fa83v)y)rG@j_5oH8L0w~5V7xFOUjVW(gw0*pQ@^D#=Ba&!uTBn>E%0%U z46Kxziwr`;2c~l1HVXi3cXj|10ABMQF)^qSo{CGzH=O=q^Z|^i_BU@$pej8g?17&W z2bnqrdc*A!MdC`lsOQ!(H`c;utuj!%Ea<^uEkX$b^JHgdb9O4Sy&}Np#q8?p@>x0$ zD|U_TU0eTaHdbGF2EPpKo*^ru8^Lm^gdU^lgrkK1|>4XN8;>LM|HWL!`Ysu9|gW{Vuhq*7(7r7jMS9(v;8}9O*XbjF;Uflx0qzLaSQ|*&+ z&pQ4`r>w>LN>^LM6S@qI!LddQzN6i1uZPN-=En?Qo6?*cE_xjEJ5ST03PIadp(k^u z(b+%POj<@qw~MRi-x&B5t-Wpy4y&eYv`kxkX3;28E$FOQCvw%aJ?pRmP|lE?!uPNq z#P@nbK||L`Vn5ihw7doO$@m*W7D>!LsxfIZG9o=xm=X!Pnj^`CTij+GtZ=kqon(@v zF3{#V@#c`TiB6$5-Y|h=-dJMB^BPhmTR+r{S*~Xr@+`^skX;tH>}GIF1|6YRQwTQ5 zGQ)=d>at$8cExdRkfrpdukX+nB|6}*U(1R|%MjGp_wj0q@Y#jC)Tkwi$_YF)X4wL{ zDc+J?%ygdQ>g!{qH-}y|>y`IbTP67x3k~SKpYJ;K#9}mwYXoxO?dy|+X5KK8MFU#Iu|x$ocaPoDevx}MkdxaN_nLwWI|r37pG2ciRF zYbr^aY(g4TqK*&}f$Js-mR?MOt}*i@g#8#!>vboSne2gIsx{{R<#K^nbSITF1NNIU z6X&WD0_zRL!rAubyJJ328`wOhm{)Ih0$LjZ4-R27F^whs*9}VCF8-Ll5N}=$yK@2e zGs{2ai2L*%Ux;rJ5P3>Dhesc`z@Esjc08cpUj=H_22^ zwuMh<*z5hu+?l6Auv~!9OKW{%a_&r+=elMie8H#f6^nutRsVn_U^;ZQKuw%XWj@BY zZ(25T&NwcckPSxdmi*F!mdGPj8yeJ!iPVMAJdnpTI$JBR#7v5B0sDF)Xg4~e_{^YI ztGF5VjU){>CJ7s1T-1BfLsO+Sva0Ag54r;erHKds{@mMe_^cK|z&0}~u66e1iuvNE z|GQ7jEMUhHvX2Ahj;>C&hXOGlWqK{5d)#IF$2ItcRR|FAZ4ioJwx&?(#u5X zlRw16ai{^Gco~0gmdrf#ubxR?-J-*tmOfsxZmTS{!-AV0c)ldpJlt|@l7<4TgDw_? zu9=@p*0pvpxR`JyZAt zrGJJE=<^DLJLcVqPc|PsCQtc3O?&a>fD0yKf3QGnrc>h-;j!h^!@SD9Id(Fu~@fp9K!F;}qk3Qw0KX*) zfRDK5p^T3&YMs&cd&kW91auyB=2GbTT7Fhu7kjeTx&r}U8`=e^sKD>3jfF_`(LF|4 z#^^-iQ3hL4wnNtw@lB!aga|fUranljE2Q7%=`o$>zz_69Q-46S?o@Q845p!l^xl70 zH&k~6!A89r+}td0Q7Zx|0gni1SPgk?$E3WJECx7rAlT#Y-^0O0bTDG=xiEl=u= zHOL399_xLM#2lWan$hjGkTeWzD7eh8B?jaOl#?1Lq3OK0qgZ%5zSoXEVu)k{zu7Jv z5SmAC_4grEr3iG3gC;HlfXm^;COgSqJLn!+TS18@OONyOc^-#Fmk#PX+@ z=LTP@m@ahtf*8=Fpf9xsfFN!lrbj%mxs9bL<6HSeQPUabBTO0iL|BuWEbkF{+F>b2|};-22~F z+>95wZvoC|llI^Vus#b=)(sUK#8>LWmk-|Ghgvh1LrK_zyB9Cex~BiE8g$}Ohk}75Ku$?B?dfce*)gX; z^<$50I;PpBl3R6;1pp5i5N{Muj+66V*H<72aBzXJ^g=-q{dEx)7~9;|ocr*w=a-jn zKPf3`j*f1F7@ZIJ;k!;mjqhuQTLdN_6?r@HvJQ z7r1Zl;-M8r+66r+{4F;I9+~sj?;CjOxHv0D_rD!p^0b$p$pE-3Vi6l5zJbES-)|4J zQ4IY)W%zC+pg7i2@D23QsZV%-C3Ltv1AzKn!xjvMw;_qPETe-3QS_>kUPIrX&ox2C z*-tVyI7Wb23dsTM_q`wg*(pd#aKLx5bywT~=V{C9#Y!=h?&b<$g|bX{xKa#X%)`@5 z0P>yc;`PBe$j?9t3L8Qkf1wDYs>c2C+Q^-CVfRXp8q!=HuF0G6UcYyeA0PB#xM;9*{ z3P{No9ra4;88Q#z;^635pKoe%I_wi;sG4*8vATTe2X%U=VP zqvrTN$~tEOMSN?$kQvPw&%tjF?V#w^37PU44tnaZ=mU{fz^oGk)wDW~{l zrOHC#_%_xzrp0!P|FdEMzxnFEhxhkBz|qkls)-jcJz%rhBLvA&f=iLBGs*bQT8|eb z43(HG;ojo*0^+m_;{huVhuVJ^Oe|a`n0r2I8!P!I?rvS#q6yO{!@{npEG z@YJ0WmBex>vDtSYDwz5!_xR?Ye~v=PZHvB;nBLi$)pPT*_1QkD#FSilA{tXsB)9=L zToQHtQ(s`F|52c3d_oM7s)ojZ9`-PANMB6fr{p)E`Uw;*di0dtV4&;emS=*`)sAqw zfyT^U_h7ew&-?Cp9@Wm#{zTXaB6Pg1Xr-#}S8VpLM|P9HyaV9vI_Rp~f5aI@L1Qma#{ctXuzP#rdy1y&?n=KqqgO$SNQ|l{-g75P&UYE z6+-fIv*JSM=8cdrb@929@9t3W9CSmQa!u!L$Q@%;Gl^Y6U0UfCe98 zjeg9Q6h56D8vYs+^*HmSyopfofY`2b^oRLPAv|;7&;h-^Zk_+JQ_pMbu)Oia>!`Nr z`ndQ3vw}J_2VT?i=LWVk6GQmF_y5hA#%d!%2-2xJ{Xj(hD82}KK`#%DF6KF?z0d2y zKf-nBz%X97Tg!v?scqT$_O+x^#X_J`*44PC3NIj*gVi?fhU!h@u|f8&O1$v7KU$Fh z(6AyK7R~!8HgEKdjN=x zc5FOrH;}2QGA>TWH4)M#2VZLJ1C^|7AM5`Qp^U;89x^!t&qZA?^|)B9^64=tsP?>w z`IeE5E?IzTqiW`}_d2C!;r9kFs37$gCYdJ|nmh1Rfu9yMZ!&HR9ro|!=fI(qg=^kM z`V*C#Lf&dJn@r$soHS@^rFPCQm1MhICHTc}(Zds~w?9PurzRRl*G}I$dNGaGrDA6O zre=(!Qh>SDclerCYcmrc{>K10-$LIRYfZec$Trn=^e@Y3W1f6#5_AIWY`Or#a`-lG zbrKJ;b3ldWIquJeRge;s1p(eUl7Pd_>S7z|orC+?I=Jbs`JKSWjGffmN_y@^{QzPu zapX64F>Y=9s7P!Jteq$ZaDc)|L==2B*8er;~?dy+QD# z(rHw!5(vFhe6ANs&9==mBN%bwZ{Wyk=lG#Fx3|{bU;qeIm7>LuA+^!3{qA7$VB)t0 zor~9{ygnhWmo6KJ8?*#35|us7Y#&#&$K&s*p|r^ZnOp*Zb37FRm0e=^)P-tF`jzDX z-lxPylMAilkw$Hg!Fk{e4G;v%S9PoKYLn3l4&CdRq4c^hhK)Cget-j#N~%2( z@SDAVK(HotzLr*g8{`9S+&@uQ1k%CV?cH5@hGq{yjgw|Q2c5P<|s>C0bf0ymNQ>nCPisC-vl$Z$O1 z_r!1WI*h_r-s_B$iWF^zDbsvTbMWYrnd8UUbl1TX&+sI%IxTUHytLKO3GYFAAXTILvfi z-7)u=OXzn|F@smmHIFvuxI71rx5YXUy;%>F8Gz)=p1mo+OQM^b;sq2?*m_&q2Y9u* zmtpoDfDzvii)~RZ=XMwSQPN_7IkJ8GiS0r14)VLtU-yWUVSD?ud102@vD_PBrsiSelA!j}KVV;T&{N zhWpoNmVk-z8VUF*!Ey=?RTmM5-9YXrfN*5nz~o>#z^3=??X*TB+y6n7?{EFI^2d7~ zKm9+{BP>!5xFUJ}|Npgfwc&z!7xf=xNiEIDfT`iaK|NliG#ssG3NxwXHa411RG%?b3-K@!ELxUqvALNWzXEjHR;)_N5= z)LYE!+$INWzI^EnY|~ji0)SD#*wg6u9ZSO?TzrzCfH-3W*ZfkgR_t!_c71wjVXk&I z{h#1FJnw62ZD>r9aERSopCOCFHD~1i?6ly4khI(OWPKA2zp>r>jo}9-8v~t@th}sz z(S>Vl3C>1saZXitYR5enTR*q^VCJlZZH8N`&sKkg;@`eYbvnf);Z}}9s}SEcD8ev# zS%~}K<7bb4amsrAlj4DWbRkyQ&twUhsWeby@j=k|lT1|ulQ~A!;mW+$T+=_niut;Q z1#3IU6+n60Nv!DhT)V;>jV0z2vg#L}F1ImnP@GwvH7@)TJ5_3-vgLU-?+(ubf?&_!U%+1&15G>|cLpL30?xIU@0 zo2wl=H(X*UzgNyAxI#>X2$c(%ezKD>5H&TS_GjB|#OtWOw_dfbMIM2u9TRn17}~O! zt&psf+WBpP`MoZbt&!r8cEcn2*z3&^v;CK8C*UxEEpV^gyCMeh`si^0bnp0FLR;GK zf45&6OHwS|vrGP~Sg~h)Mz7!E4K+z(Kx%3f3qJAMi+564#)KxrE~pi`#-kynd;07q zo$)sl=4R3EQhA``l8lHSvu{4NA9~FRt8_wSNDa(YVzXD;E-rW{fC2=Inwy z;s&I20ksektDI+?{vQQ?`}jV_K(Erm!$R)GGgfmOlcy77E*E(x3Ah2hbuPL;TiU7D zh3c(%xjD;ocmt6kMIkeOGpjENrhQdcjTNV^XY>i)sARoLDaPa$JiXOIhkFEbs~!?e z{E(Y8!$Dazv2jccg!ZNS|NX`kQ7-kCvMw}fD`Rc!`Yp||fVC%pJeYpg&lq0t^YHW- zkRAKxOtk3H-iLrvfBrNdcX7s2`VI~Vc@RaiV_!9A8B>w-5w6!~!R=~cvZCNt=%a-j zFmjo~1zEq!bHO}_sjr3psw{<@cdSSDp`r0ymCmuw>W>~&)Bm~0a4n}W z7cD2wW4r(iV!CId(&p+7%IEld=d!H?wcSy%*h%HtTf<{g&Pi*pBj3XG?oco&pfGss zZDK`WG%KL3SQA1a-`3v$tTiZX{(bVoDM6pk86WS0*b9)3bb7q=6JO!yb|yP7Y16nYU1z zrh8+k>F^VOtBd2HAU<5#g`6vk!s0734kdtP43fg3oElv?J^s@K+IoEzq$e+t*uXXg znm3w&+MZWTEeL4Cfdb+HR^yIGT>s(C-wfkk zPe7{INKBitf+AH@@YkM?iWz~CROP{;^o4#F8*jOlaNv*#GJojr=biTpRjIfxxtU00 zAq+gkM93tauQ>itI0OfYqr!We#*!*=O*82=9P-+71y)m&^868S9XPVU!X)yLH(7UV zr}Fv;|8K}22q?jZRl26Mkuy&1DSAb@~qgQ1yxV!g(XCZPkCm5HFf+9u>!*$suRh!Pl94r!-4Y>VVddK> zqrLgZ4{#zTBauX65NQx3-P8CCKgbI>hPg#3gVl1f(z&0KYHzfZ2D(T{>QsCdQFKfZzr&uKybv zPzSS&H_5IS38;CB=GDPnemW=+&;vAg^_KimKOkUoZwabKgK=?yj}Np6Mrf^>tRFKx zKD|5Z4!Q-OCPp4l7;pJpce0{r2Bf!q9{Dw5HonrKYD>&{WNuE+x2^_m&PGz&m#-Pn zwLRI=Yusujsy|9h7Zn^2uP_{!b;Iu3^YsQYzV%fW{fY5e=raAXdcQeeuiiu1`HonQ zl%sk5tJ!xWYG7PoIaU1TU(p&3co-&GoQPQDf{AvP2Vtcn=EQMsbLUm#>^)FS@OJW4h6!M8iJ(=G@WmaAa8`I8C@s(7t_F#I{~-B%9;%Vi@y0uAbQa z+Dgc({h`l=^v$gMuz4&YOIq#K@x&F{8c`12iWg#SV!!TmcVNVL-XVD0Z8Wkt6^WL#c=2IEhD`lxpXJ3n=g@}8OgcoK=#EJ^ZAS!hPUCaLHe4OF?R!d@KF^SDs!HLUOwN*hu}MOM)XgV zZ0@@s8p)`st~(NuVG@q~d`uz}&GlnYm_eUjQ(fQn3oS&9^qh*#uS+NS;BN#$Ke?J> zqz8#aK>yz`ooV}Vd=f@?egk5 zGx<#F5M||R(kz5SpVo+oh4aEDH&D&EsD|W4i-KH#_7)pK(leDG$Iv>d-Sd=HOV;_lp0UdZ>DQJP9%GN`u5-Lr=#= zrF7O8awT#T!)uYC00nsg`^X$dQ({x(*!8$X+R^kpJMoUAmmcu&Ue7vrkO2N$npq4j z7;@{}n^CdAneRZOt;I&=?rjhT%&*y4P zIYcJzU$!E4UBfcPUcLO1Df?HJ;(j;b?ST+sIY_3;hfxB{ zrReEbzG&<_O~$x~ukHg1&@J3&y7V8(`^A0xB2v3RxW)y{y)-Dlk>l;l(a`+Z zFVpb8GA3WJ3--QI!it3+2{&Vp!m~>!93SD6{b|1AAr&|L)~sb!V=@C>3&#E7Z;z{d zX65x-;XO)zzOGTDk3(RHlRQ(4%}3*}q7)U1noqLZ=n?mGFf9@Xrxi~_8tu9jQnESV$Os;`B`qPsCE}L)(A2Ho zXgk@4TS{2z|#6q4? zevwZXH2kxiC&hkh3^*vIJVN)jRMcLsIwU?-2el1E2Y6iF3pU6tJj0r2B#D!0a z^;(^3@AM}W(rQ@fjRbC`Ye7PlGkS`e0g!cM+RT7P0QF1U|2DFM0c=J0mA!Pjzb}^? zeCctcA^kL-mWZYZc$f?|T&i73H@C28OBT?h+;$`cb7>)<2*C^C>O=5z=LCKGPHNMx zeuaj*XcvIsz6k@%U1=f7WDd>Uou1)9wd!w{ z%SJnZrqOc6&{FB~>7shOh|T(|ae9SYIBe`X>yaEeA8(m;Rue_nFE#mWs(9=^DkZo> zf4TD4%6Qe{LpdGSOPE&?W~gG~Zw3y$SKnub%CRlq@%asIYkbMX-1Z$TJdwmqPEl(6 z9xmtsvpZ0R&KuWun?zCJKsO)y&KNd2Qy%jIU&wxYx2Nd)9uYq_Qy7PhmBv`KpTTq0 zGyHF5M}K=LqbsvYIPf)b;Lim&FK;)Pa-<*#9lW-;3#6|Id|P|4aq_@HddmX}pUwF8 z{`YrRXQx(r1Npqv7ESd|N!fPr9fn=~(zinV*vw@dHYU7}jOZk{UHrJE%8o&LON*-k z^Fx!BAUvSaKZEK&UBTGPZl^KdF}^;5h7>yrtG3cGOPzY?(qp&dpONn4X@&NZIUV|3 zyTu-k0GW(DVSSr8R%SW^dzkweY(4?h)LP+DBJ!I+bRrl}*^b435JV znZk(*xc3T%l5jY6Pj1%^p9pKa!_ePwyn8d-*7M)k-y3UT`-+L;n|dCb`(?K367IcC z9%tC0P`W|BHjYrqC9~P1)2RhOt4rBGxCJ!81yj|!&hL5I@dFL|ZhsU*Px$mJ5@`O# z(o?8+%24Z9Z#X5rOgudKGf>}q`(H{0(MsC`o`%IJU9bJYB}a^rQ(x$eClPi!Xk<=w ztqUgE&}m@FkyYGn^MH&0-wm@x-(G!!4RrTcc(gjqQ?IQqVQY9lo0dRyL48I-#I$z#cU2*fnR27 zh>If!dj4!~C`&n+j&na@{&B;7cB|rN&XbZ8C}*>fMq-j{Yvk2b1-8oB&)`;ng`=jGo@mLN$~S~D`g8iz&qr5UFu5;AaZEZp zA6(lGC*3&t$5CKs>f%w+^Xed#=<-_VTI(VM_w>#W9EPpCjIv7}w9a_?gwa+#Py3=ePIQw*aI<@A@Q z^_uHLedb8H#@+Rxc1to&M1LMRLShtSqnkiUabjJ_G0oak*Q+U!*tsbJFXwW}-8k zBuP_paa60R1z$tZcCvckmBVN4>&|~&$$YOCUKImRIDBOwLG$Z_AsuAa6wE}X&Kky# zO(19u!oC~^@k^=$BvIST(wEWfYUsv=VIH@`Qh);Cl7r0n2A=DiKO9(P;(_s>V?^!0 zH+)IMmX!On#><%Z{-$@kF)}}@!)XBj5zgY2?L#f76WmuIjaH@p zxQpMq(xoe}uf8061hICt_+_+Zfx!KP55T&}!W_^2{DM2!Qf>Q+lwU=L;_F|_J&N^U zKmk7=QS9jiJRxZE!&jC4CLu?tDUIlRs+{c<_V_`4(u3!}>+7*oYLx3Ii1LoCS}YS% zhLp5Ud-`0EnSBFVa2A2zshVFp7I*gVNK|B}GXGpraCwPT3-9xbp}`XIVn3F{4$&AX z8e{jGv9i2uE z4EA^C%YKozn+fu+qguQ-2R~410jHH|$GAWXX+^`!A@kf@Za>#5*ih2aW(^L^p$wW9sIUz;WFVHpXU1Ye@hC7!w-o%zbsoV z=M13+|FSABNo8-LKxg<~<@#WLERnWSKH;)%W6_mS^)-m8oTEHoD0`PQGHMW4+2Gjf zuC%je0Z0gck8~-ba2Oh z>T_^#=q@9A^TOB{==g~Tiw{25EAy2{+E*p}NVlo=LwT%?oh`pC>Ux!rKMVpo;Via& zQPC~c*QXG)A7?~Y|DE@w=ZB4$eVmR}vo=9_7E`g{v$%SCO&Oq*CgNm+OSb{1Zhl5?;`Xp(l#h**Vk}XiN9S%^ zSM&N8zQeG4`*SHv6y})#zi1o|q##y2IQZL3Xboq%pD;TTPqBh&oZnBD^U!)X_TqZo zkv%xqnrJxtx;{|+e)G3V-kD-jvsu$^L(eBql!6qR`C?hshBZI9k6%bV6;6gYsW%r^ zY3oM09^4GSs89dRX%ZOvtN#r3V3d+*89PR|DQrbtB zJhL6IZ@c=_DCOJ~yeHfoh?J{_Ktw?w!*!_^uL8e&Toj7X=}QaC2{^Q{ZH9of#!E(; zS;Hiqaf}6YVd1GZBilyP>aYGNO$!IY@SiSg&cc!iDBmv6t%1)}6vt-;KVvB0vT4VW zk2B)BDwKi{mkCwxa`lGu-qY|2oHV(BiHUdr35PoV&%~SzAK3^Wz5oNjdnKJnY|j@L znMz+i9ihwgv+0uH_leGnF|s|b2=D9zwXryT@B_AcXpHxy{gxx20j5@wn;XOH zy^rPZx$E%=y_MveRA@U@p*kIF;nJJ-fMFJNi!BB^>3wr`jcy;$PCn1N%u354i=C(i zBQ>mg_e4>yG)QRIV1HY|ds>-)+LTKbGs7P8QQ3C=QLI=A2}O49=u4wk2M*Ckr&hL z(odL}Bq(AM{t)3Zt(n|R+uFL?GtsCvrEsLx%R-=FGhT4ZXZ_<~^sAD^9~dDW^`kI{ z_{T%;A#;&D!^dj0>mDi}hqG<%{jOYZ9#8GQ$oJ+@z(jKS0QPmTtUvTq z9dyvk770feb|<5h5)zGK5~Q{!F3>e7n}$O~+>UqBk{>g-t+-ytj&WI5>BG(v-LuNW zW9^AwwF<1E;(AHMrNuWVRgOgrwI7GmTy%Wjhbym4bjODsZ~0?T7g?{kA95eL^h9gu zQUe(?UBinvBy%PcRl;>F^3PI+*^e0WG<2GB9Y)S#9hg2sKkg2xg114)c#u=ZU~ z@&0>U@2)vGn&~x@SZ^)(7NML)CRaZZE_sga&A?*%)!RM=oNkWk`DC-wGDC9XMiU8x z@DwOyi$0{SiEy+h&)rc044bpR?n~7v3Uiys%3})cy@dda8>H}QoAdPx-G_`FevV8- z1@7bMD^em`J!R{bdf1vc$9hh_5^D1ntj}Re^P_)va15DC^&GiurYz)}=eaI0Qxr0KAp^nA4S&3OHNnoa} z*Sie}rVkunyGjoZZ;Nt~8ioxfYj&U)Hb>uAww{Dk(~=L?`k{NjA(4mq(q2@>jNO-?DQ6v{-+ zqwLUnr`_8&G3n}0dYH7Om9TNqKXfPYIs)rdZWMq|Wlu07r@^?#sWoBBnet|9AURKA z&jG|3udWGcH9K2g)zG~cIf@X@UfKQLzq@n}&s{R81K%d(HHvAnDzJDnjsr6$me}G$l6SSu! zTxp;0O$zuyH2B5?^$zZfeocAYCSke0*X7X??|2r}cb)qn`{!BI(;+fO=lZ7ilUOxI z!ys)hvk9uX#QbW+u)m>?d=Tc$JgbQ#vs?X>@cG1TaYlMAz^AtEEwZ;O;+nV$4m5Pi zF_0DTIcMqMlZ9{L&e(*z^7mCxE#ZB&1A(nU*VTom|*(T~CMY&t+#tvk!?ePQr zLL;J6ZFLfKymu(;y@6ZT;%%2->-P>he zxwE!`8Y>1r^_f88WNQq-?D;*DRK}Bi%&4`!b{F&nLy9V zQWUfMWg!{Ng z=75T~Y$}i;^;p?KHkF_>>$q|(HFzdMg@urqNmHJuLoQ85mOM=W4>nI+=M?l2K_ued zJgzd~^z7p>Ye)S720qZ#!nB-vK}jn=QSm_Y&n_86Tx?cknWSq&5B+A37N)Rzda;2x z=<41AEadv13b)(Zdu$4Anr^!X6)wCr<%C zFhaY%o0Q zed+s(i4vEI0qli3A0GNSo#5YC`Tex2whfd6{BdxVuPdu7p7uTBrxy&?%q{;6dP#n1 z_YfIPBHlRzjt#c#{{`v*vjc9I)f~LY9OU|=liD8*>4aoSAsk=NbVuvkYOOWc!^TO2 zzAI-?fB7bD|2>WRi!d^^Y0NNnz?BgRtM z&TU_(WZS(!WL{eIgg3nh(LYK*x|oNmZcMQ7=DUca^op0v zWtTKDArdgUV*ARLrh9@0_zcvI8QGPXe6|C2=fWS;0Ybab)2Ry)yDx_gMZ$Jf#xEa^ z&}F>OPwJ=@C%QB37XX0i| zLU57KV|3#tt~=34#+&WQPZEhAoN|nlQAzw7vJC`}0@rY+ZfAdhzlz&GHGbp24LLpo#9l)Jauz=ZgDr)o|GVLx97#-`UaZc&cmfaz;ec@;m8MrZGexeFS?nRqc z=+x+Pu*qyzjanNUUtdQsyhL(*^?Ja^UQ`Fr&n`gf=(Pdw^yg9dmO(S~(6j7Jvvm`x z>_~)8{_Q(^e;Bx1_>o9>ItbA?9ZU@cLt~L}i-m>Vxfk6?lzq&tpz<8a^&}~_4F03* zw|et^nE88dAoUcL#M@WQ>JaM=ldIy|q@WMLAdPg7O`~E)n=g>ObaU2F#QMt{7>^?P zSP5horW1R$2A&go{UGfbIh-a?GjoIa{5y61p#GxD0xPPq?WY{gna&&lZJj7KA% z@Q8X~-BORl{a+Y3ZHWrK?4bdOVHDnN>UNWj>iOV>Iad$z-30<`Xu!_wueAScIT){g zhpjl9y}!bnsk#V@kn`xZ89ZL`Sa+ExW!wWatmo8PKOxX7znk&G>$AJ0Z&J`NYe#Qh zO-xi^d=H*d%P!1wEaP;_!XfaV_{}tjYZqzGvgC!44;z*cACILSfdqZ-3!idsn^yt7>I-UZusm`Ta4VcTD+=5MmbNx9+ycm@j4I{mdP4^hys%?Uv zseWpndf*2@L>80T1!MGLl#Em=fw&^>^zb~#lLEuF^m!4)#`udoypsYGlg~V0HpqLU zOf?(nU*<|N-omCoZBKCtWr2j?g6VBXV{CZXL3$WN0Z0jzrgfGlprfYZQ?fMd=SD z4`(UPBW_%cj5O~5A!`7CCN%mZt!EvOlR?CngbWcGk{2_6Uc z1iU!A*sPey1A}7lLhqisILY1r&Hsmb?Z-PY+0=`tY?I(b6H@t=XK5;9fGd}5W>R&n zb4rxZ(4u$8MvrcE|F5FhE7=8Nx;c}TCc#UN_l-O_Y>*?FF~V*_*r~(R_?d%GIzl;# zCYJ13G|$)B8Wjg|2&iP`$fd)h3MCR8%80cA5 zqO(}&UEhq-qEfxNqP1%wn5i85D?2CLOxWxzaEysm3V&2$t+J=tanQ<#q7CWnz$ktS z+=087_W%f|`a`5c!fo>D@)KUHDO?Yg+2QCE02n}~@{Y*I_ntsnzrT~eaUK>a;5%g6 z9-O~xtVtuAifY|tXAl13QhC_-R}-BRCbS%J+J7}!V!Fn|d366&Ql!rdpyGN^z>Ld1 z8&oI{_*FLS@gsc;72KJyP+`wX@iJ%H%alY*^8Van!D+2W-Kpe+3{Q@fXc)D{x$y&{ z)iG1Z<4+fiwZ<9Vzv`m}KEG++dfUo2O_@awiqyZnW|y&3{wu$%)W@Q`BjI))HFT>W zWQB&x!8}d|BzqquyTaB^Cv-_*qFx;|JNg8&KKr`9ylAVPWr7Vn>auxqdKMkcBNvF2 z%9oQuegq?x?mdCPpM!Cm)=)%Wa-TvtCDreK#prqJ@K@c-ONTVUbHW1aV|e{`)n4ng zJn1jrb<_!Kg{1}Rp1AUgWhf1#d)W#PbXUj_ufNi`DSisErh>kriy0hT5ALo>A?sJL zyfW4|d6{D#7?CNj2&VcO1pUMS_-PhOYTou)KgnV}KdQS;M(a{&1(<^;zXBn%E=bB- zBEo2i{r>5zkg5ugzd-bWU)1Z)fOj9ysOev?>2ZL2J&E}Sa@uDZecROQO2+OwGEsq^ zH8tJCN{ENxOGpF!5&=Xk%fS!vX9pj{CMh;V#gter9q7gr8Qt0B(w;O67DkqDBD!Am zzbA9gycY_tzTeEguG_w=S49ZFgZzy+ouiKMPSYYM&3u1)wxBN8cyrC|nGNBrk{&#K zrU~K@=P4ZPO93=J%L%ifmI#`$aME(X|E$t^)6uJ@tl!qG)?5S9&+S;#ZoG$vlP4sc(N$lb;!xxZGME?FsdIgDEU!#%xMC4!Mf=;92Dcn|BJ9FHR@WpO@3K!~{GEB%yYLmU+9^P@2d|eO{b1)QhQ zBeu@+7t6z$wj{uBb5`OB#n_|GQwIu^v~mBqb8;80h_=2xvZtn;Aa|M|)0>QHVSO0; zxH`DDKkftT$0wL?pWdNOr3xyD9suvF06zeMK9(ln*sO~Vw*PD*rmCFp+tn3Qwq49e zHR)p*R@{cZRE-S_wLyZkbz+C)eFlw@XM(tv39}LFCy(ZW47HJ>4a|MZtc@ZSZ(Qh@ zC7iKQHdjl>Rcv5Ts0G_kC8qd_Rh9EQenj7aPwc|!yJLc1vQgB7#+~agkMK}`w67F% zp%UH*t}&1x-+~n8aW3RhMC5n2#(ZK(+wUCi8_{MwbIj+?BA&<%dO!@ zC-kqW1j_w-ht?5ZE9zXGJ%$ua(nGtH+)_Kwy zbtLj$sYPgdZE|2dWvTv&MA29RAnmgJ5`rNIKSVxfQIMN5aZhZpTM+11!aKT482P>` zs;8f7-v>mBGG)e@iFna)2T|sWkTeS$l^4w#q;xIcjgXK z7?J3=*^lmxrgOSl>f=bl-XQ%i%+c*kEn^MMP`e55ikEAx`E#&1`;}f2vSVy?_RH#H0{yGC z-KN$Ooc0@&bKZVO3PI-UUU}14Dgq2r@2++MOUtF+#9|QMZK1SMtPs|s~0lmaqNSXYCDkqC3%@Nfa@^B!kgUqpDBS=oM z`E++^6GAuxP@-!O0%F!^zBlaRv)gUgujItfi+%P_yVW*z zl#N%Riu|lEMY;r*ZowE9i(UC7_jlG@AvuISqB5MO zBqm&hV@GIc?NiZy+T`zK5Fj5-6$H`HQoYcV%Lb-5B<@xZ_E|XCTF_#MN{0JW`5im& zkp2}@QYCgSJ_pCsSF1HUO>-aJFHrV5b#FH%O*-@GR7H2F2_7t~0}ni+aooL{lOVQU ztJUg~?CN8NY-#dL-{}gSrjLWXd)HJl6Mi9_4#S8`8wo+ig! zN2t@3a!%Fw_Nw|6GDAWUAa6-vQH{F)6e72wL_ zpMA%_;K4p+6b?8Zh7Sq4wqNeedbC=eJWbZnfM6j*)ct76yzK5uOqLLW#;-`!gf3f5 zAA!2e09M6i=;5O;>i@485S3IkTJF^PiVwvwJ4{|-wi=cW>n=83r~Z63l%4ahakD@3 zZ0dp7W4+d=9NDpThd_XBR<#gEmkX@$HwjZk`SJhL3tEf$Er69VUhB=s!Q(BP>DQ7f z=;()>!J*s1FYoGd29WRQXa73GW&{pfoDchnlx1we50~vgHgX@R7dZ)wa5+9Y(~$bt znn$D3tb&VvYO_y9K>p4PIr!;4+NIlNW1YTlnJJ92?M(B3Oa7&qfKwBry{?Mw=kyxU z)sfS0`90hAmzTj7aFCe*`rjORjJWsI>Bz?FOHYL2P^8iu{-Gf;%osvZ_MNF**uqv$ zJOUUo<_-vdcCfZLg)nt~rc?>N`_7Dyo`c_A$I6g-C>N^t*`1b1&lm!Dqq-kLu?%_< zejT&wmX(nE#HQHOp~!o#40b|-9reZ2Hn&t8Y%ujCFOdtQfgZ*2@H$n~73*76hmgAZ z^ez%3u7yU;@AxT0IP}bl^500m#4z?bIno@vACs$AadqjqAxw=i8>@|#UhJWS{x^^N z=>KO;6r>UN4UF6e5(HQ36B8i)Yh(mZvdJ%i$XoU2ox>r#YF8JJ_c!&OjpcBrXDA|= zl1`uC$-0(5)y^z*58x{k>(Z{<$*}5^K41!AY4ZFVg9P;p71rHuKESx);_6VRL$dl+ zXI@o(q?^AtolpGV>?Y}%CnEF_^QKBUG0!S|dN|A|EPNj_5YeLzX9hY@{cp9x;fFtf zVa7klct74yVz%;KqQ|%guxbBWam3V8)Ml}ETi)h%cHx>kZ2`7;Dbf5VBu83z<0k}&mEYIerF%6ZyT$s{0F{1)g943{}swb=V z*X*pZ5az5zNg&}4^n^ziY9+{sdDPc9$bifE!G+nnmN%%hlwB62Ov*pB?Z3KT5WU{5 zY5S4Dgp>#E4#jTpKE^XTeECUov~2EN)9eSwJ8cj6Ltg@ZDj=+Zu=1eU@@#j8G9*8u zXjm49KF)n>amOlmxFhx%ffFcL#mw&nr^maahtR-v976bfAY&_uquv15TJe`IeeFyb z0f?)1@jDr*+$b7U=8pgLS~-6YfjW%;_YZ&{34bpS6k&}>>YW*CpmH^!?uZ9agY`&i zk0B*_`Sh=h%L2JGc7sW|J>kE4{BL5%{b3*g*+EaWJG5bCTOT2yWfsaP5snO@5C zgI$@wwIvRGcN9tel^Y5t)h?^&S?AvN5SeH%p>)U&s!e16(ZNyOZWSxYUhtEdI41K zwUHjyyK_Bu=v>^N5I3oc@7<*Y2441JQ}crLGoh zvL$AsrxRK$Q-2DpbXJ1?Hkr1gAy+;|e3jQ#|MhE^yxgrkSixds(vN_V{{Eu^mDKgN zi{FuEJADf0T9nQ1WquXq!(MBNdN)2SBljy3ciq=Rj(#hcWCDW&BO~X*(n2-f1|w0_ zU(LfuwR9D9-G}>t_ zH*UFN9(;PAP}B;ng1qwc^A|tyShNDS6y1k{noU3J=3|M1rwaWlMV<{;j*ov)qmnSz zmD6$DG+ z@~4yrp(0M9s?Oca@yQNsgvvgaRD>vZN!#c?2YH3@0-vor+rEjf z&4H2|C1WuYo3!wrVntnjvHfMvbq^?j-Xo!4oeZRsO9j`by*>KjcR6LnI>-8Dow<0v zS+&|7%KYSee{+9MfBQB@qR)cbaH$Bg_80xzm+tZ96)2D zl;CB*g>QMLxA*`c(?Vm9M-j*cHSHHF?iz4Q>n#OFutoZM=ME`^&aM$+;~al#$*@%Wu{vOJs6L>$dEsVd-?YGtSgExw&IOX{l+Z z4(J3Mc#^~?d$&gc5)xy{Z#S_?oyZEduy$(o;Id~^J*)@T=!mBUO$DxdGplYTaqfDe zn$`i{yMU2!Dp{0WWg>g}WXpSMdeeJ*OAo2|IIzvbR(4X$#^&drH{5N_?WCfZk-VYs z?J)YG9-l~q7jhRKg2&$fFfCG`R` z`CFtko*jv-FschRyYZBT{z6S~>O|rF4;%GQ8r*WEe>Ws;_g4#}*;&+;l$+G+ZVv`B zhc!xDqsOcW@d};j7P`H&PE$nq0(HrxsG0Pz%7ZzBd3%PKBB?9Ah&wO*$b!ejx_k-o zq5XQkRCqe16mf;%9&vWlGFQ3i*3)&vX{*KqO~B(5khIY=VO?_L@=c!PNQ|IX;29Vi zm}{`^dBCdsHM_ZsCmEbU zKH3+LPJn(0Gj70O6vw-9jQ|(=(G=R5?H=KsShg5TS6IClQrp)FD>R{(%DNx0(I%-A zkr{5eo4U8(kC8(U6kBm|N_-%k1~Ql>rfy=V&qH~QyZ1N*wNTziDSuPKnJ%widtQ!sDI8ojIY(9`+kfNx*e`(VR4Se`Ne9>odTU-_A~km z)d24zw-4iXV&(!Gs!^#a88T5N-(&$QJI=-=lM|%a1DYrGQtE|jZaLs^U}g1Zl6y~L zx;$r?CXh4Fro)>P*aCUj!Vk0e_$K)C=aN|E{rTfL-ZEINB2YU{O_ji}gj9u`Zhg`y z?)H;n#jb8@JHMjqsnvanc6|j=On{_L*O(DE%SwY{T{W*<6{pa(N19rPYuj;QL{8c0 zFx%Utv4m?Y#l2m`P#H2UjT?7lcs}k?WHlw)$?^hQ2b#D7)TQMwlN( zTkVgL=9?ZiheSJfL)_Btn5?8=xFP z`&=YDlkspK$IE4rd&1vest+Q36ZP0~UF6)Afe}AXy^v?>i`VIBE}rhuWY2Q1n^haj zD{7mz+=td>r|C`d7ovLgyt~$duD$3xaRy~>DIJcgT2iZ^?~Y(c<8lPF!D0rK`hGia zCoSHT-|1!`Z+L9^7hiYRku`S=>xmU#O}sa5ywPp8v!o#ti{PUTqE^t@8PA1%N6C1b z5%OUCbK1O7g<=Au$xrnjSZSe`(f;Kv)!jER`Hzo|&|3q$UhzDFZvWC)bLc*ZQ;sSj z^mwbA@&aKdww)Z2vrr1SOG+tk)|0rh?UkM+wq z9b%#DB%8CHi;@&n`~LgBZtbVlG^mRxw1yNX zkUD5LO#F7yosImR8$H`js$7lDPrHr=amw>2duiyc-&HE^aQ)ima~EF}COXxR|pXIEiE27QBc*tuX&zDj_&q!vDt#=w5J-bs*q zH@!J1i`i&G4>x_={koSE9OPe}-Gb{KeXO*3KNYx$}eyKGZ_ zJH5IFtcZw40r#R7iEzIPrF5cs3NZrqi$VUBGX*CPaXFlqg>?Yy-7Avd%635}kD#}$ ztsDEGs~27ykn{#=N{;80XAas++S|-WE}}L!{pEAhh*831?Q0eQJ-?&A_+E?qNnkzR zM$l-|z#*kFJ^Ho9bKd*a1U0PdlsB|f<$e5n4R}SoUpYqVR~+q?2Rv-+GPG-!Rfp?} zs8%kqXi(!Pn$Ne77b~dJl-AIM@D(SlUw8AA(UgU7Ta%KEp4RSI`6{QnY+aq=LJuAO zg3t<4u{8Oe?$SG~sI@jZ!`KZrS#KuG98nZTLD;sfMFbayvb>hZ6k1nXVN(6CQLk&c znm1`3|15Wg-k4(MUi)L(ytX!(n<~+mRPB648VK9K7Qe_ceOOX8IXfY6LWvK>Lle=3 zeHL;yPjrH$>uGBKh95F?LXszW3oV{HO&g4q;*i_N=Vp9!5_>E1y{#fjQsEEt5H0ck zig-=5N9K$;Oe~Snn28y+)}m^`5jYv?2pmY^sQdBi;1tgHg?kJH^|$ngr)*=Lu9plN z`v87N+i3?puC*Dvyqbds^cRV1rI5mDpJ%kem_>dXUT!LDusw(d=?IAN@4cJL_bt{; zx2D8&m4=|2vMGZvPwgfNyDzC%9WM4{`c;R>tNAAy6l?)GE2moiEvIGWOeamo zZ2C)Eu4|Fw%x-C2Z~3Hij!%AK>WuR?f)-IVa86cTsh(U`nswhiaYGKs(AwBE5qWj8 zIi|C%4T4SKM8~U4by57ip&=ZoqU~)op6pbex zmwcCwO6HHfGX5<^&_asDFA^smf}f$StGiBBF|vlr43&h{3H`jedE3!hb-hE3xmFU8 zpgy)CGHn0Wz0L3T>Jm!fAu5M?NMe=t%T(rE`}2u}v7UsIk`f!m!*G0xTsbA18o5z% zy>=G0lO}Q!-*UGZ-V$dm@gg4By_@M;kuW(pS2{Ar;1se?pVCjb`zAW`CJLkPY24;` znD!Qxrqq;mocbqf30I@d#jTXOFlST#&|cJ0Yb@9)E`2_uV@z$9-KmFDJ?Ali9%(O&%f?(#mLSl!HU1h)hZ%&V#Oj5Sj>Sc$C`b9RVOPDC5 z2e#T}de5YLNS*onZFlFbbaM-=1mtBfTZK`lx(gpnXBcE6_k@oZhTwp}Z^%uGy7y;) z={7XnK2e@+gVogG)_jfCs85}H-xS6#FOW84{cb_YQaieY-0=~ei=DAndhSt~_g83u z$TQmDkw(DE-0BSPFQ-PELT+vleQs`}NxJ8$q~z`%yzL}t`jOG&rEKrCYjF_6#h~sv zYlqv&@+sf%7bE24W>s(9=N>kgz}mD?3~!mU@%J9~q6V(kRTK7b7av?<{PJINtJ|%wRZ#Au)0-@M$DW!r$QQxv-KIC^-9=!QxIn3 zI3Y%2wVct`_(3NQ?L(scNpHTOlEi82cj4@7xv5tDTB$hU;OzgZntBv9E$!(e6v8Vy zCM}v!hcthcKx+MH9oPh4g{0+VnnR++4y0=Fb-oIW+YP&fN*Sl|6mD9iTCq);uOoIu zc!sZtv&k#?VO*N=)PGW}u1M6aVwtD&U{p-5?fBQa*A{t{zWxG{Lq^JVTcZ~v{ZuK3 z^NxEsbTatLXfN!peB`AAR4U^6%6Ankph9~@z`m?bZ;)FSqL*0XZM|B!t5N<>|b10H&bPB zTEFZxGGlYpvZ8_vPU=>mzbI^EejX6_-%QejN>aM{W^(v6;dk1o)T^8&7kP1BT>L*NdXi8kdwBpVVaBFXx%p ziS}drZg>-Eo62e712)n0Z0+lNcwSJe6C5q^1ZImr>TH#Ssve%~%houRV6vS3Ok#5< z+x-j;2sw85^*~JNSq^;!B}GPsT@(e>{cr;ja$JVNNY;1bs3h{c7hD-~k9Sih8gwgZ=j65Lhy{>}H&qk=+F6OP^N_y~>#BX#@jg!o2(6Y+X@zQ|xoHg2gt4#2QqL zWZ}&*2IS+~A62??+TQ~s$s1226L}}~T!wepPJxw}phKyd9#@6xcWKuMAzA+}ek>s;>kH1HWzyP8yyK;-MzRfa5&X z1lA_WYGa9?Yo>vjPCwm4)O4mMP`(sqvp7^7~9N_edZejxRNE-9(d%ezUz z1vzAN@C~1~o`Z7pzwi8EyWOT?k%hMKTbe&-x%{!Vp;VTfTGV?|{hsJ;A^yixO4d1@ zx_{z+JcJ3`%DxwO&Ey5u$e8K!b5j!wYB|Xwe`x5d(A%6)Gg+w!*#uP;|H<`zyCu$tFK` zmmbfzVFZGEJv_(=Mo)*lbbWK)x+NRvh);K6pkY>vsQK@Z-@1v~(BlMp;q$F2{UJ7% z-;fiL+p25AD_%*IgGn_oC~>Grjb-0xP}tpN5C(h{vdqP+EZ;$tb}*E@10b@PP(((+*N7Mt9hnpo4Qlc0#v~`!CNgk5ApV z^F8b!`snBgO&qOBtU%fqSp92rSUDXZWLUkKLFeU0GE*D|Ge96#c-+ADacG;WvbeJj%%a6@>2xTe|U&ZcTHrH z50Id@fBo5A`CR{i4yCpK$^YPZk2-vQtC@hT+hXs}ZtnkSEDciAX?co?LGSR$uPrlu zignydDmcIdhc$z#jBi~l@@n_e8@-5im3F3!c9=T+oHz+k{}l)X0*Pk9)!%8s$)RR- zwRb@eMuaYFYl!HVR=e61TgA1ZoQH&D#Y4v+4nL>qsuN^n?oDr(1EVQOwqzIX$vtxb_ zcSS2w{`cqVlU>(|@>n%q3>a4THi-Zq7%bLD|6WY5q-%c$XCV-|ygoYI4Azh!~;7#3T?9-O@#lT*J2*Cqx9MA3l<|03{i}o z@v;p~mG9aA`&Z{#VIuOuA&y4!7_4rr?%8^KpEtm<0_zUGM)d#HQM1+t=I7Cuz7-@N zY8Gk(II$hgKzyam{%;15015a8|IqME6jP{~MkS^1)i9~BEngz~e?71H!2MM{$4(=; z8di6Xh2@NJ6AEERM>yJp|0V)UXH5BB6Rf@Un0;OdBQoB7cm6aT&Rs!9`q#u4qoa>% z@4s$&IujEzu-PphwOCU0V{uk%YT)hcyhYI z06)ST{#oQ^@1JV6pQEZjm=0%9K==<)o+w1BX;o^t|L)UWxoN0-{5zhDnBgB}`{Xuroj;t9~xrhIL^~0dA8P72a zH~Z97l@=^zZ$Q-N1Kz)hG(!wH)v$V*USUr=VnRc<{^1`B-OT^yaeP#{nD-v$C2O}6 zjGGiz6jqf+97%Bm-gd%MW%jgPuBR3pth@@ zR#fLRmr8psJ#B~yB?ed>W^9qbj}9tSKEvU&n;zX+EaCD(j5S{qWD*>CJ;6M1-6Ph) zpqBNVuxxsn9|+9VG%y;ii3#P|;9}U+>S!MlhIcR5y9RnM1DBITU&T^ESiUzcbxcsv zcml$GA9mqiDUXQT_7Qd~FNV@P_~HHS%{LZpx!78`%9x9Z31$fBc}GapPY1ZX5cqkH z29L2=(?~`zy74T;TwPqxhYcG)qCLclwA`upIsl#?KKIzit=O-`e)|d?2NXIl#rWY( z?jT6sH0}@9q&}>%~bcqkSsJRqc;W<6-{q;^$&uU#8){C+k!=eFh5;hdzsj@oW1_j$IV`q zG+A2MQvke2lzX7=>kG+**HOID*pP2z?y~IHe|Wcl<8OeS`L{ds!43L64~9}J=IiOg zvCbwg_5zo{=f`K`I*YSxy}v#ITSI;KbH~CMtu&BKolS3KD<=+Nkl9opm{M~)m`tYl zkUdq}ykSC)2T>40Gt5&}RiX4-x2&)%86Wx6#m(+j$_xbV1Hd96;~AQ5hV0ERtM_OK zk}iW<7ke@j11$Z2*Jqu4MW}NpH+z$vCp-jy0{>RB7CO`NL<5r-q!LGq%U$h!-!Gic zjjgHoi9#4s>Z~{Ue^j$MR~V?KH*8{By^z z#U>yk6qGQXnlyKe=F0G$ngb{=u7uuf$0v!7MVo84?;rYin7M3y$`=D-92*g&AUtq^ zyb186>iF6Q>RA(i6BvjI$Ul+kA83l8o$VtM=M;~H4W!J>@h1Y%fU7sYEI{36q&DCY{alt zo`x#0o8~@bR@g&Pz_cCAU;q z{x$B%il&)9X;>hT0?PM*oGpn|7Nx0`P`y4C&p0q&I#T)4Sqrl%h>&Sa(8R@@r=-}C3)(OC2T{)x5E#qCpgdpp{Pm)c$p37(fIyjI77>t*L^92Nve z9w{V@JBrw|Kwliweiu)Z^VCdhF|1(Ef6^r zRc1gi-x(+jC|N&BFAXJgwW?p5_@2(b_PaA|UWnzqZ(O@6htU2)a)>!j=N`AJm}VNoD720;XL`G>*R)S z+YWYv%Qw!BT83-L8H7h%vfQH6bW-+n7heeHz7f0Su+kj<>|Zh3wx-VV?P-S+IWvXJ zxB1%5D;FaIA1--Sn+i@ldg5mDK-nD0OByRfnELEg43+6qoC~pO8Yb=Gj5N))lwp1p zy)D5|gWiFrk95TwdZedMT-XWf^Q<)Q`pn86f1!0tYe#xS!pXnZF<$P<@c5&yvC>u| z^Vo(r6}$gw>9djOhQ*`I?)SmtFVDv}yajGKe_0}ys|Hfj%ek56celIVi@JDUCCVp- zK8qZr=o#}k5N~L=@;wmIJ*bv_KmJO!DMQ018L8rW%VfVWh7I*3W5;J)v}h-E03~J7 zctPoWv_>KJ;=ppVMd#}FvF4#NW7fxmDvS{soIGq0!ItaI*6(IEY7i&A%2~cL{8F6? z)bGCnspC_+9cV-y4hp=@JG@UPWqvpB(4nr%A~PK{M;maz>?44>9Ia|MZS-#mT8)jH zx3xQ{xd~a#b~(RA;h{!dU4}8fR6xu=xy^jkqWDyo08lIZpPP{X{z@m z-5=a0KHjejwK$)-=P$jT*UY4eM53q~>qV0kVzVMe}*18fEp4N-zoqOw0G-;@re^9FmdL?9u8{|N7` zxbd9#vM(;)CxyLGC!1B-OH%2M9kFV~U)9UDa()-OPrZWDs- zDlg1IToxaB#fMsyy8TJT`Yi=w}P{LlZjGEKcivGr%9D{BGY>c~S?g zbON8+_lxX$0XL$;3mm8o9qPQvq?ZiB;KIURfC&eSOWw#8`Hp;hR*jLw=X$%K?XRAi=DC1Q8J^G>M^F$b)p~FRNFa~l5FNH<6raLaQ z6G!+_F0dzE%v>f%MYpW6z=5h8PMGOWmsE{6(0$Rz-^-xKPC^ z?0{f_2_2kIbYC)@g)_Iy5`O|w*F{_F^SaGbm~W_F@6MCc@3W^2gzSu_RcpYEQcu}7 zab;c2yKRTlA4}3n)7cm*)wsCeG@QDhyYjJ}H6cP~ByZ7HEMffRqh}l{py~o7!sAa+ zg>zl>-5vcW>6z|tpP;9+fgED-T8c_uc_ekL=kZeZXbsD*0&Jo#mll=veMGrcw#+Yq zQZ3@xXlLX?;azV|UkPv3aO(3$@U?79z(a2DOA{KCP&+6b%CF?+Uy!+E=`47xE==Iqhws>G=JJ~Klk*xx_v0c5(Z@KY@=Eh=8KR#k9dU$zSc z$&QVCx9trBVY-E#u!^w0%i!uoV`fXsRdKIwN#~|M_@5Vbr3GcY6^mB+JxSP@4$1{P zeP%y5Tp$D+#!b7{Jy>1)@2!9s)%c)cqOuULsr2 z%~$x_ogd;PDq_0f+>cs;S&K)`-Pt$TL;?jHW3HR&m2aJx%utqoZNFjFUEC2Xf7>g4 zS#kc&^s4T+n8rn-cui}9llm;{vCzr=vQb>~?oVv5m zNTX!?=n?6$_zv}Gj5>vh-JgnW;E~=HKSAh304&AXuu?++bj^KC^rdP*?Iz~X=U@Jm zEeet+LTjuw04QJF83Xd$Y zyN>H%S0!CIH3E2f12@P*BzPJEq!@L%MYthr4M& zhL&Kr)2+Qm(ThF@yx{moZQbS=Z!Vyp{7q2Dd?sh=meN~b`!{-*bbG_F4>$(*Jwu(` zrR7sKz|!hkrfWI!)sW$?7(msriock&osWHfn=Tq?<`na`$H2pb?4SUb%ELoC)!W-Yak+j1PJb$YWxEpH9SNR0 zX64BJ_sURdM|8B*J+!)(nMO`<5KJ{5t?VJvB9WF2GE`de+y%kNZrF zb#D`HLDgEf`xOnM_!R{hGw`6I+ zI#dYF3>-kopYP_%cx}`*l7d2Q{MMr{dcfU~3({*8VsO!h1|md(xnw_0)i+HNF8Hit zcJRv5YH(m$ws}FEg!V?|UMgrQA2EjWfb+sp7WjX@hs!;ZD&}R4(KaC`{m--FDGjzR;JW ztLG~>d*W`+P_RS52nClE-u)%<%!KWsj)xVP5#8S%RL*P^GJ&f@z>A!b{zI@&hn^?05ED>!J{ahp=B z;lXD`6M7obrmd-(5h`agFoNlf(6Pk_Vi3B17`d&Rv}PC3jZb5^3F_u)({NI-Ss+4$BZm6*^4EHtB{h&RUY#= zO?|4cC^m<_{J(#sHsaiRF#} zNE$-u-@&bY0NdW?h_>^C$h~w$txC(i6b4udRa4w-J8cGPH#VYCAW`v_#bIXFbS$J* zfgJ8R2UqK;6bcXa(t$xv*o((nONWhf1M51E;OwqfPZwK*m43uP|C%SnCzVOS-x%V% zV`u;_;JDg6lxV9DeSip|7Qfk8x%aU0lj=+cpU7jlQK%Q575uIbHcp#1hKUDBzaPLK zw1Az6yL`1JeF)m-G@}Enz?y7pxO196#eU6&_=YX|U|V%4;(!=(AVH`@ zZtlc0Nju@ermlx<9^^MFENh;^lpODvFXVB|Ct6=DlK=$3;VP$^_oeows<_O>B zjjem$Tk=%#*<2=rvdpyEhnd9qPg>0}_a^#q?YCWgiXZdxqWoz@dB@APDRB|8@Z#?=35TFR?N9?7aTJTHKiZ_=|=aV zz|7CP8|xBAf{ecSJng(WlYIX*TIM|tUOIBm+(-fMV&2ZwuS<(1Tq}as%3l<-`|NaB zJ_CKW8tL&FmrA)LWlyFMmghex4h-IXMnR`}&*va;XOk*~@!D8MWR=VzrRv+>sW0zs zd9eG>7uLN$zr1kHF1LN2C_&sBB`*v~V=9?E+i1dmqC1Ebf+J1J<$B=1E0!CXK07Mz zbu_Ym>bP_nn!muco!fD=zfo%`CMtMJ_N4Ue!aD%#Ne@Dnvs*I0eATDWAKmb$nS43P zjta&)&YmQzHaVb;ZB|u80v9Hmmf8*fVWK~mGLr%7pID?JILHGPl!Cu9pE#k?5*Cf>c+e`?1lQoPIzEAUhzJD}1_d(oU!{W{83X|H_Qnu^4FUbtY zsl47D(2oMNGfR zd~?+(D5IjiSL@Rbez8g;0y)P+Dk&r`=p{Nbu^ zp|iSX?KsF9utAqr4To(knT7A45bk7|ACD`)*C}H}r$4c|u>}Zm4wvGd?j=plp>M1o>DoR<1C{?8(1Pl zesDWurnA}J~;mr^OHisJ{}HYLqf?VQx_i8!2IAz5-Z zd~tY3y|~X|dTpEQ>(}G`F0d!~iXgXF_lxDr#2LFYTYu`tNrzUQKjj<78l_KmGd)0lgCOwhg%YA}(zFxY}@=tKC4>5JwC9gT25tRt7O}=gptl8(-S@+0{#Y za-rypdmP@AW-%N1iC-eabB;2%qBXP^8CZF{$I&0L z`pc`;x|LG{4+%*_oS&AnX+HP1X3gLR1KDwV0iPe}WQ*B2)JqFNzCEO>9E=6Z0|HH% z`ky$V5bhL>fETMchH>!;TD7kJ3q?KVSI|AFLARxv7V!ATc>8*tZaL##7N80z@m}Ls zCZjFr_l*(1Z3^C!hCmYV4CDOTUS8U&@-Qx?2u#%VJ_S9dBI4e%tVURXjsQ_E{gPvw zzsoM8njzllAoEQ*&T`F{5*Y%iNhMcRWU~}QFl(SOuYXc|1Iv64uh6?5^BTpNx`T|UHjB1WYLOME zUpCm%En;ySgP!$Xo!!&iI;yMCz(i%G+XDqEe=okaluDtxkRcurB@~E8A&kM4GLQ3Gal2mpc4{} zGtxU8_qSy3#M4KfBI!1}Kp?|0I?jbVuA*+7mgm{uhCj}rf2+?8{qQ>D^-F*tHtE!& zhr=+a#NNa>Q>o>nqd>CaKnUfjJ1~`|6`O?~UUB_{qxXqPCUX`oQ+d)W+h3f&%=4LMN{frK(c zpHLaeh{kyi40>T#PBH^Sc{2QIyPtetTFe&;%J3nXwGA{SX zP~zhgGV@a(K&Y=mz(--VXmW=|`ZxRUz336DVyvg*>lr=n!@B%d{4L6Ho{A2koKz&| z_9)HP;BzSFs2D%sSGC)ei%)xqd#KyHcBXxKdv|E%6W7gRW4O-#{MmKfr;${uUG|2# zd`!0?2!yZ9NKyA}BPLb9v!Sml9VJ<;BJz|fz8K4HnSVcX@^L7Kg5sdnpFd+yzu$hA zZ@_>^{yY}a-Ut~aEPH19wkE8L{z1bF{J98QRlRWMvgXVI zKT3>Ifat8%L0h}bZP+)4JXbP2s&u9?-{H`SJMAchxG{Yv*j*^?QM22Dfl6w;{!rm- z$BT(Zo#U+#Ffe|?SUT6OdW-LlRH*i{)os2XBY*qRjipm&*XJF;TiHCHaXSEOe|8-| z?IZeU#>6;2F+J(3VFc_DLtj~0`so~E8q#bSRj8aDBwM15OOUevgnqA6a4e1NH$=oM zA-Vo(=mFNScC!gQWcN34CMOnjewKgn{c{(?6>bYc(Pv$P53+ugQ2~^W4;9$U<9PNj zE7&)97{C=4vU*>$Z7b^R7V1yVQzZ)Ch5weaXghi^0YdOY6&r_IPDb!Z>Dp9^>%H0h z=Z>rrux)>5n6VR7s@4=g2>*v)JJdy?Wx+PckqG_V0HrTj#1hgzjdeDa z+q4A105*lT%G^}B7OWfpW7M(ki|N9h=w`8YU%9z6yW+?`&;lEdtISnNpKbNf|(?oSYe<}#*zzGKT3t7B~UYh|FWI(#Gfv;V_&6}mC7 z;XhPVwohSK?Elq{#KmRd+c|>`P2=>iw0$*oob)PE86t509-xNZ8F`5BN$3yQPA|WkFbXwipdt zG~FYIYlD0elm~|O2a#I=Uw3El2W6MSyU}xE=n{vg!Jce*(Rt-{-)34jooug~!I40<>RAruVb44IAEu-Y>B=y$ z;R~4UzqXs z@?O29;M;G|a0>?e!lrq{K;(zc&VbqvyD|-fjU@s}=;M!q-jybm{eQ(>`y-QWA776g zGLKP{Ly0%)S#3F#9%{v&%5j?WaZOrNQeuQzV!Y~z#JX#I%OuN zBw^4CJmDr-)Y?i0a*}FIMP7XD8{#|Y@agu=s6d^@jc)kB2phOlC>SnM20cP^oJi z8up|zUIR^j<6qwumpF{Zxyis_8f?_3xE<w|J74TIU9GMg2BzOiUzhuvJb_o0N zTxhei->L`$Tn|{QdXZwvvd#R6h~FLE!GL-^@R3e&?Tr@(*4tcC3)W-pfaii^oS##= z6xd4KGnAL%Zp%Hd3hq7d2KVfnL=7`l)G_4ux0`@BqcomY8+To(>5k{%x%#|?rbj0L za}v47DIiV5w!D}V%5t-7eD|5_AMtP9!hap6(mACA89%fj@Rt^%)=-fhr9v!(Ao;0D@)M>rS?$d`QGvq+H2CuJhwP-PaxJ_G+W~yNH3VaElD~4-bGAV^44WGhCmH=qtQaLYWRpcF$|Vg=>^c%!s$_8 zgQ)M~^*W5v1>hk8ecajZ7Y=Vm7(!T7y!3C;h6?gzeL#Y|X0Jd84h9`?2!e3$Wu-w* zJixfB*+x@1RUxqIXy2bg36G7HZUwNfe032Ccsrd%V|8~PJW^$Sy7FcwXOj%ZKs0zz*IcobxLxb*dFstCl+P#=!&zFf7QUE{srt#RFfbg7q= zHV0Bw!yW!{`S5cuu}y$3IbBOOa4giHNZVqP9wB-9qjQJ-vgcbx`BjUfXT9GEYLdam z{U<-dlFEXD20Wxv(&CJY%cmqS(viaH#zJ5A#%!QfMtYA$b5ZbRC?@CHA`l^v)e_3H zypH<5BP%u;^N52F`seN6PL1hcxk`^JqV2_*Kpm_-_vv$*$u)0vF82zTQp8tRlN8Dt z04u}p$kCxZ3kkpFpFLVm;@k_ed0XmxF*QS9ErA4IwP1IQ{0RcBbgRwOxNcOF{Y$Uy zj=gG>-so7D&8={_-y7uNI^gM!1~E~ar?=eN-B)tYHH24-o#(0>&*MJNj&buK*0yro zsih%!cb5pAjti;VL0&|73?7yP z510hZTK_rRW06Oq)C`qlL*}WJ&acvIO{aaH&n{SWU zulRZccvE)H#mf1E-S23;=qOyK#?Ztti{-2k9@`fyp=%&x2Y5GCyur?>0 JsyyKx^IzS*wTA!z diff --git a/notebooks/end2end_example/bnn-pynq/verification.svg b/notebooks/end2end_example/bnn-pynq/verification.svg new file mode 100755 index 0000000000..9c6c4b91a4 --- /dev/null +++ b/notebooks/end2end_example/bnn-pynq/verification.svg @@ -0,0 +1 @@ + \ No newline at end of file From 561e69be7b6c2ce000058ce802beea235c88bee9 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 11 Mar 2024 13:23:25 +0000 Subject: [PATCH 222/291] [NBs] Fix linting for verification svg --- notebooks/end2end_example/bnn-pynq/verification.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/end2end_example/bnn-pynq/verification.svg b/notebooks/end2end_example/bnn-pynq/verification.svg index 9c6c4b91a4..9cf8e86088 100755 --- a/notebooks/end2end_example/bnn-pynq/verification.svg +++ b/notebooks/end2end_example/bnn-pynq/verification.svg @@ -1 +1 @@ - \ No newline at end of file + From c4aa418ef13ca87f13f5520e051a4baa5b857c72 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 12 Mar 2024 10:56:57 +0000 Subject: [PATCH 223/291] [mvu]: updated comments and removed mvu_vvu_lut module --- finn-rtllib/mvu/mvu_vvu_axi.sv | 9 +-------- finn-rtllib/mvu/mvu_vvu_axi_wrapper.v | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index d7b16319c8..2a7403b6b3 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -57,7 +57,7 @@ module mvu_vvu_axi #( int unsigned ACCU_WIDTH, bit SIGNED_ACTIVATIONS = 0, - bit PUMPED_COMPUTE = 0, // requires an even SIMD % 2 == 0 + bit PUMPED_COMPUTE = 0, bit FORCE_BEHAVIORAL = 0, bit M_REG_LUT = 1, @@ -319,13 +319,6 @@ module mvu_vvu_axi #( .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), .vld(dsp_vld), .p(dsp_p) ); - "mvu_vvu_lut": - mvu_vvu_lut #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), - .WEIGHT_WIDTH(WEIGHT_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .M_REG(M_REG_LUT)) core ( - .clk(dsp_clk), .rst, .en(dsp_en), - .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), - .vld(dsp_vld), .p(dsp_p) - ); default: initial begin $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); $finish; diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v index 936f2ce0fc..50c15c1b02 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -81,7 +81,7 @@ mvu_vvu_axi #( .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) ) inst ( .ap_clk(ap_clk), - .ap_clk2x(1'b0), + .ap_clk2x(1'b0), // wired to ground since double-pumped compute not enabled through FINN for now .ap_rst_n(ap_rst_n), .s_axis_weights_tdata(weights_V_TDATA), .s_axis_weights_tvalid(weights_V_TVALID), From 07ac1c9273c38a56c1e6d1bc0f144dda68dcf004 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 12 Mar 2024 13:14:22 +0000 Subject: [PATCH 224/291] [Thresholding] Add NC case to HW op execution fct --- src/finn/custom_op/fpgadataflow/thresholding.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py index 822bb1476f..dde813a293 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding.py +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -242,9 +242,16 @@ def execute_node(self, context, graph): node = self.onnx_node inp_values = context[node.input[0]] th_val = context[node.input[1]] - - y = multithreshold(np.transpose(inp_values, (0, 3, 1, 2)), th_val) - y = y.transpose(0, 2, 3, 1) + # MT expects inputs to be in the shape (N,C,H,W) or (N, C) + # if 4D then input values in context are (N,H,W,C) and need to + # be transposed. + # if 2D then inputs can be passed directly to MT function + is_4d = len(inp_values.shape) == 4 + if is_4d: + inp_values = np.transpose(inp_values, (0, 3, 1, 2)) + y = multithreshold(inp_values, th_val) + if is_4d: + y = y.transpose(0, 2, 3, 1) act = DataType[self.get_nodeattr("outputDataType")] if act == DataType["BIPOLAR"]: # binary to bipolar From 68ea1106214921d7ddbd5626548037309ac135c6 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 12 Mar 2024 17:25:07 +0000 Subject: [PATCH 225/291] [NBs] Update cnv end2end and advanced builder settings notebook --- .../4_advanced_builder_settings.ipynb | 327 ++++++++++++++++-- .../advanced/cnv-w2a2_folding_config.json | 79 +++++ .../bnn-pynq/cnv_end2end_example.ipynb | 42 +-- 3 files changed, 401 insertions(+), 47 deletions(-) create mode 100644 notebooks/advanced/cnv-w2a2_folding_config.json diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index d9db2c2bc1..dccac6195d 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -228,7 +228,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hw.onnx\", localhost_url=\"xirxlabs60\")" + "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hw.onnx\")" ] }, { @@ -635,7 +635,7 @@ "id": "8fd0af6b", "metadata": {}, "source": [ - "The model contains now a `Thresholding` layer in the beginning and a `LabelSelect_Batch` layer at the end. Please note, that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator." + "The model contains now a `Thresholding` layer in the beginning and a `LabelSelect` layer at the end. Please note, that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator." ] }, { @@ -646,6 +646,289 @@ "## Specialize layers configuration json " ] }, + { + "cell_type": "markdown", + "id": "4ae83d6e-c704-4c7f-a922-a4b470c0a55f", + "metadata": {}, + "source": [ + "The FINN compiler was developed with the assumption that the hardware blocks corresponding to the neural network layers are developed based on HLS. Although we do not want to abolish this HLS implementation at this time, it has become apparent over the years that for certain modules it makes sense to implement them in RTL. This allows us greater control over the resulting hardware and we can make optimal use of FPGA resources.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ed72aabf-0517-422f-a686-6c70e7492114", + "metadata": {}, + "source": [ + "So, with the growth of more and more RTL variants of common FINN hardware building blocks, we introduced an additional builder step called `step_specialize_layers`. In this step HW nodes get specialized to either an HLS or RTL variant of the node. " + ] + }, + { + "cell_type": "markdown", + "id": "82a2bc39-8a37-49aa-a79d-2818e66ebd11", + "metadata": {}, + "source": [ + "They get converted either based on pre-determined rules or the user provides a configuration file which contains the desired setting. If the user preference cannot be fulfilled, a warning will be printed and the implementation style will be set to a default. " + ] + }, + { + "cell_type": "markdown", + "id": "bc90b589-7a92-4996-9704-02736ac4e60e", + "metadata": {}, + "source": [ + "The builder flow step before `step_specialize_layers` generates a template json file to set the preferred implementation style per layer. We can copy it from one of the previous runs to this folder and manipulate it to pass it to a new build." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb88eb1-3f11-4343-ae7c-3e5e8cbc34dc", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n", + " specialize_layers_config = json.load(json_file)\n", + "\n", + "print(json.dumps(specialize_layers_config, indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "158d7d8c-a072-4a50-9714-43ebaefa53d1", + "metadata": {}, + "source": [ + "As you can see, each node is listed in the .json file and an empty string for the node attribute `preferred_impl_style` is instantiated by default. We can now use this .json and set the `preferred_impl_style` to pass to a new builder flow." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f464d35-6774-4751-80b4-b6230e501539", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n", + " specialize_layers_config = json.load(json_file)\n", + "\n", + "# Set all preferred_impl_style to all HLS\n", + "for key in specialize_layers_config:\n", + " if \"preferred_impl_style\" in specialize_layers_config[key]:\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n", + "# Save as .json \n", + "with open(\"specialize_layers_all_hls.json\", \"w\") as jsonFile:\n", + " json.dump(specialize_layers_config, jsonFile)\n", + " \n", + "# Set SWG to RTL variant\n", + "for key in specialize_layers_config:\n", + " if \"preferred_impl_style\" in specialize_layers_config[key]:\n", + " if key.startswith(\"ConvolutionInputGenerator\"):\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"rtl\"\n", + " else:\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n", + "# Save as .json \n", + "with open(\"specialize_layers_swg_rtl.json\", \"w\") as jsonFile:\n", + " json.dump(specialize_layers_config, jsonFile)" + ] + }, + { + "cell_type": "markdown", + "id": "52592ea6-cd12-46b9-af91-5960b4749e7e", + "metadata": {}, + "source": [ + "We created two `specialize_layers_config_files`:\n", + "* One which sets all layers to `\"hls\"`\n", + "* One that sets `preferred_impl_style` for the ConvolutionInputGenerator to `\"rtl\"`" + ] + }, + { + "cell_type": "markdown", + "id": "701905d8-c5cc-4cc0-b872-156c5b9d0432", + "metadata": {}, + "source": [ + "In the following we will setup two build flows and run them to the estimate reports step. Afterwards we will investigate the intermediate .onnx files and compare the two runs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ff1a91-7ef7-44cb-86d3-60b9af7a8c5e", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## specialize_layers_config_file = \"specialize_layers_all_hls.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_all_hls\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9df41ff-ef6a-4d0e-ab36-241bb11ed241", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff617f21-6001-4bb7-9cf7-2cc2acd3fbec", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## specialize_layers_config_file = \"specialize_layers_swg_rtl.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_swg_rtl\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " specialize_layers_config_file = \"specialize_layers_swg_rtl.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f48ba95-f7b5-455b-8041-25b7341ad115", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "bed4bedd-397d-4bd1-8531-c6ceac306715", + "metadata": {}, + "source": [ + "First we are looking into the intermediate model after `step_create_dataflow_partition` and then after `step_specialize_layers`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e64db23-98cb-494b-851f-3cc2c3847451", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_create_dataflow_partition.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e1a6351-367f-47a6-b802-a2613ea455a1", + "metadata": {}, + "source": [ + "Let's have a look first at the model which we specialize to \"all HLS\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f85d6c42-153d-4a40-b3cc-a4c8c89fe636", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "e1520920-b7de-42a5-9ec8-e8503992fbd1", + "metadata": {}, + "source": [ + "As you can see, each op type has now a suffix indicating that it is an HLS variant of the node. Additionally, when you click on one of the node in the Netron visualization, you can see that module is set to `finn.custom_op.fpgadataflow.hls`.\n", + "\n", + "Let's now have a look at the model in which we specialized the ConvolutionInputGenerator to `\"rtl\"`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f1f26a0-3a62-4920-bf40-5b1b798fa02e", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_swg_rtl/intermediate_models/step_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9c4de4-61ef-4698-ab23-87bf5953c5ae", + "metadata": {}, + "source": [ + "You can use the cells above to try out different settings and pass it to the builder flow. Please note that not all layers have HLS and RTL variants, so it might be that the setting you define in `specialize_layers_config.json` gets ignored and a sensible default is set instead. The FINN compiler will display a warning in this case." + ] + }, { "cell_type": "markdown", "id": "5ffbadd1", @@ -950,7 +1233,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\", localhost_url=\"xirxlabs60\")" + "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\")" ] }, { @@ -1444,17 +1727,15 @@ "id": "ffa2a352", "metadata": {}, "source": [ - "For an optimized design, we download the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples). And will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the fpga part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours." + "For an optimized design, we saved a local copy of the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples) in this folder. And will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the fpga part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours." ] }, { - "cell_type": "code", - "execution_count": null, - "id": "765e5ee7", + "cell_type": "markdown", + "id": "8d1b041f-027c-444e-81ac-98ce9b6d1b51", "metadata": {}, - "outputs": [], "source": [ - "!wget https://raw.githubusercontent.com/Xilinx/finn-examples/main/build/bnn-pynq/folding_config/cnv-w2a2_folding_config.json" + "Note that we set one additional argument: `default_swg_exception = True`. This is done because this example is customized to fit on the Pynq-Z1 board, to optimize the resources we remove FIFOs between SWGs and MVAUs manually to avoid unnecessary buffering." ] }, { @@ -1506,13 +1787,15 @@ "]\n", "\n", "cfg_build = build.DataflowBuildConfig(\n", - " output_dir = output_dir,\n", - " mvau_wwidth_max = 80,\n", - " synth_clk_period_ns = 10.0,\n", - " folding_config_file = \"cnv-w2a2_folding_config.json\",\n", - " board = \"Pynq-Z1\",\n", - " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", - " steps = build_steps,\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " #specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n", + " folding_config_file = \"cnv-w2a2_folding_config.json\",\n", + " board = \"Pynq-Z1\",\n", + " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", + " steps = build_steps,\n", + " default_swg_exception = True,\n", " generate_outputs=[\n", " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", " build_cfg.DataflowOutputType.STITCHED_IP,\n", @@ -1532,17 +1815,9 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "build.build_dataflow_cfg(model_file, cfg_build);" + "#%%time\n", + "#build.build_dataflow_cfg(model_file, cfg_build);" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3eccb045-13b8-410b-bfcb-9e9c7146a1b4", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/advanced/cnv-w2a2_folding_config.json b/notebooks/advanced/cnv-w2a2_folding_config.json new file mode 100644 index 0000000000..68409ff695 --- /dev/null +++ b/notebooks/advanced/cnv-w2a2_folding_config.json @@ -0,0 +1,79 @@ +{ + "Defaults": {}, + "Thresholding_hls_0": { + "PE": 1, + "ram_style": "distributed" + }, + "ConvolutionInputGenerator_rtl_0": { + "SIMD": 3, + "ram_style": "distributed" + }, + "MVAU_hls_0": { + "PE": 8, + "SIMD": 3, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_1": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_1": { + "PE": 16, + "SIMD": 16, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_2": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_2": { + "PE": 8, + "SIMD": 16, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_3": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_3": { + "PE": 8, + "SIMD": 16, + "ram_style": "block" + }, + "ConvolutionInputGenerator_rtl_4": { + "SIMD": 8, + "ram_style": "distributed" + }, + "MVAU_hls_4": { + "PE": 4, + "SIMD": 8, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_5": { + "SIMD": 8, + "ram_style": "distributed" + }, + "MVAU_hls_5": { + "PE": 1, + "SIMD": 8, + "ram_style": "auto" + }, + "MVAU_hls_6": { + "PE": 1, + "SIMD": 2, + "ram_style": "distributed" + }, + "MVAU_hls_7": { + "PE": 2, + "SIMD": 2, + "ram_style": "block" + }, + "MVAU_hls_8": { + "PE": 5, + "SIMD": 1, + "ram_style": "distributed" + }, + "LabelSelect_hls_0": { + "PE": 1 + } +} diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 9e9d52e476..3141d54ddf 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -46,8 +46,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", - "There is an additional section for functional verification (red section) on the left side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n", + "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", "We will use the helper function `showInNetron` to show the ONNX model at the current transformation step. The Netron displays are interactive, but they only work when running the notebook actively and not on GitHub (i.e. if you are viewing this on GitHub you'll only see blank squares)." @@ -207,7 +207,7 @@ "\n", "![](cnv-mp-fc.png)\n", "\n", - "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", + "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU) or sometimes called matrix-vector-activation unit (MVAU). But now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib) and/or as RTL modules in [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n", "\n", "\n", "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n", @@ -252,7 +252,7 @@ "\n", "* `Streamline` moves floating point scaling and addition operations closer to the input of the nearest thresholding activation and absorbs them into thresholds\n", "* `LowerConvsToMatMul` converts ONNX `Conv` nodes into sequences of `Im2Col, MatMul` nodes as discussed above. `Im2Col` is a custom FINN ONNX high-level node type that implements the sliding window operator.\n", - "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n", + "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib and finn-rtllib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n", "* You may recall `ConvertBipolarMatMulToXnorPopcount` from the TFC-w1a1 example, which is needed to implement bipolar-by-bipolar (w1a1) networks correctly using finn-hlslib.\n", "\n", "Let's visualize the streamlined and lowered network with Netron. Observe how all the `Conv` nodes have turned into pairs of `Im2Col, MatMul` nodes, and many nodes including `BatchNorm, Mul, Add` nodes have disappeared and replaced with `MultiThreshold` nodes." @@ -271,9 +271,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Partitioning, Conversion to HLS Layers and Folding\n", + "## 3. Partitioning, Conversion to HW Layers and Folding\n", "\n", - "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HLS equivalents and separate them out into a *dataflow partition*:\n" + "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HW equivalents, separate them out into a *dataflow partition* and specialize them to HLS variants:\n" ] }, { @@ -282,27 +282,25 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", + "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n", " CreateDataflowPartition,\n", ")\n", "from finn.transformation.move_reshape import RemoveCNVtoFCFlatten\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "from qonnx.custom_op.registry import getCustomOp\n", "from qonnx.transformation.infer_data_layouts import InferDataLayouts\n", "\n", - "# choose the memory mode for the MVTU units, decoupled or const\n", - "mem_mode = \"decoupled\"\n", - "\n", "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_streamlined.onnx\")\n", - "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))\n", - "model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))\n", + "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n", + "model = model.transform(to_hw.InferQuantizedMatrixVectorActivation())\n", "# TopK to LabelSelect\n", - "model = model.transform(to_hls.InferLabelSelectLayer())\n", + "model = model.transform(to_hw.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", - "model = model.transform(to_hls.InferThresholdingLayer())\n", - "model = model.transform(to_hls.InferConvInpGen())\n", - "model = model.transform(to_hls.InferStreamingMaxPool())\n", - "# get rid of Reshape(-1, 1) operation between hlslib nodes\n", + "model = model.transform(to_hw.InferThresholdingLayer())\n", + "model = model.transform(to_hw.InferConvInpGen())\n", + "model = model.transform(to_hw.InferStreamingMaxPool())\n", + "# get rid of Reshape(-1, 1) operation between hw nodes\n", "model = model.transform(RemoveCNVtoFCFlatten())\n", "# get rid of Tranpose -> Tranpose identity seq\n", "model = model.transform(absorb.AbsorbConsecutiveTransposes())\n", @@ -314,7 +312,9 @@ "sdp_node = getCustomOp(sdp_node)\n", "dataflow_model_filename = sdp_node.get_nodeattr(\"model\")\n", "# save the dataflow partition with a different name for easier access\n", + "# and specialize the layers to HLS variants\n", "dataflow_model = ModelWrapper(dataflow_model_filename)\n", + "dataflow_model = dataflow_model.transform(SpecializeLayers())\n", "dataflow_model.save(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")" ] }, @@ -322,7 +322,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers started. That `Reshape` is essentialy a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations in hlslib. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*" + "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers started. That `Reshape` is essentialy a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*" ] }, { @@ -364,7 +364,7 @@ "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")\n", - "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n", "folding = [\n", " (16, 3, [128]),\n", @@ -384,7 +384,7 @@ " fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n", "\n", "# use same SIMD values for the sliding window operators\n", - "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n", + "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator_rtl\")\n", "for i in range(len(swg_layers)):\n", " swg_inst = getCustomOp(swg_layers[i])\n", " simd = folding[i][1]\n", @@ -398,7 +398,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below we visualize in Netron to observe the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into graph, as well as the folding factors in the `PE` and `SIMD` attributes of each `MatrixVectorActivation`." + "Below we visualize in Netron to observe the folding factors in the `PE` and `SIMD` attributes of each `MVAU_hls`." ] }, { From 9aab2a46ae76c5ae176ec9562afb95a97b58ca74 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 14 Mar 2024 09:25:15 +0000 Subject: [PATCH 226/291] [Docs] Update auto generated docs files --- docs/finn/conf.py | 2 +- docs/finn/source_code/finn.analysis.rst | 8 + .../finn.custom_op.fpgadataflow.hls.rst | 184 ++++++++++++++++++ .../finn.custom_op.fpgadataflow.rst | 153 +++++++-------- .../finn.custom_op.fpgadataflow.rtl.rst | 46 +++++ .../finn.transformation.fpgadataflow.rst | 41 ++-- docs/finn/source_code/finn.transformation.rst | 76 +++++++- docs/finn/source_code/finn.util.rst | 43 +++- docs/finn/verification.rst | 2 +- docs/requirements.txt | 3 + requirements.txt | 2 +- 11 files changed, 447 insertions(+), 113 deletions(-) create mode 100644 docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst create mode 100644 docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst diff --git a/docs/finn/conf.py b/docs/finn/conf.py index 47ba99fb5f..a4416706c2 100644 --- a/docs/finn/conf.py +++ b/docs/finn/conf.py @@ -19,7 +19,7 @@ # -- Project information ----------------------------------------------------- project = "FINN" -copyright = "2020, Xilinx" +copyright = "2020-2022, Xilinx, 2022-2024, AMD" author = "Y. Umuroglu and J. Petri-Koenig" diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst index f2321dbee7..d97c04eb62 100644 --- a/docs/finn/source_code/finn.analysis.rst +++ b/docs/finn/source_code/finn.analysis.rst @@ -31,6 +31,14 @@ qonnx.analysis.inference\_cost :undoc-members: :show-inheritance: +qonnx.analysis.tensor\_stats +----------------------------- + +.. automodule:: qonnx.analysis.tensor_stats + :members: + :undoc-members: + :show-inheritance: + qonnx.analysis.topology ----------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst new file mode 100644 index 0000000000..5a4fff6052 --- /dev/null +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst @@ -0,0 +1,184 @@ +***************************** +Custom Op - fpgadataflow.hls +***************************** + +HLS Custom Op Nodes +=================== + +finn.custom\_op.fpgadataflow.addstreams\_hls +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.addstreams_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.channelwise\_op\_hls +----------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.channelwise_op_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.checksum_hls +------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.hls.checksum_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.concat_hls +----------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.concat_hls + :members: + :undoc-members: + :show-inheritance: + + +finn.custom\_op.fpgadataflow.convolutioninputgenerator_hls +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.downsampler_hls +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.downsampler_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.duplicatestreams\_hls +------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.duplicatestreams_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.fmpadding\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.fmpadding_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.fmpadding\_pixel\_hls +--------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.globalaccpool\_hls +--------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.globalaccpool_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.iodma\_hls +---------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.iodma_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.labelselect\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.labelselect_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.lookup\_hls +------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.hls.lookup_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.matrixvectoractivation_hls +-------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls + :members: + :undoc-members: + :show-inheritance: + + +finn.custom\_op.fpgadataflow.pool\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.pool_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_hls +---------------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingeltwise\_hls +---------------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.streamingeltwise_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingmaxpool\_hls +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.thresholding\_hls +------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.thresholding_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.tlastmarker\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.tlastmarker_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.upsampler\_hls +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.upsampler_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.vectorvectoractivation\_hls +--------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index 3627855cfb..25aafc324e 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -2,71 +2,71 @@ Custom Op - fpgadataflow ************************ -HLS Custom Op Nodes -=================== +Submodules +========== -Base Class ----------- +.. toctree:: + :maxdepth: 2 -.. automodule:: finn.custom_op.fpgadataflow.hlscustomop - :members: - :undoc-members: - :show-inheritance: + finn.custom_op.fpgadataflow.hls + finn.custom_op.fpgadataflow.rtl -finn.custom\_op.fpgadataflow.addstreams\_batch ------------------------------------------------ -.. automodule:: finn.custom_op.fpgadataflow.addstreams_batch +HW Custom Op Nodes +=================== + +Base Class - HWCustomOp +------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.hwcustomop :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.channelwise\_op\_batch ------------------------------------------------------ +HLSBackend +----------- -.. automodule:: finn.custom_op.fpgadataflow.channelwise_op_batch +.. automodule:: finn.custom_op.fpgadataflow.hlsbackend :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.checksum --------------------------------------- +RTLBackend +----------- -.. automodule:: finn.custom_op.fpgadataflow.checksum +.. automodule:: finn.custom_op.fpgadataflow.rtlbackend :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.concat -------------------------------------- +finn.custom\_op.fpgadataflow.addstreams +---------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.concat +.. automodule:: finn.custom_op.fpgadataflow.addstreams :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.channelwise\_op +--------------------------------------------- -finn.custom\_op.fpgadataflow.convolutioninputgenerator --------------------------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator +.. automodule:: finn.custom_op.fpgadataflow.channelwise_op :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.convolutioninputgenerator1d -------------------------------------------------------------- +finn.custom\_op.fpgadataflow.concat +------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator1d +.. automodule:: finn.custom_op.fpgadataflow.concat :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.convolutioninputgenerator +-------------------------------------------------------- -finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl ------------------------------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl +.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator :members: :undoc-members: :show-inheritance: @@ -79,52 +79,42 @@ finn.custom\_op.fpgadataflow.downsampler :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.duplicatestreams\_batch -------------------------------------------------------- +finn.custom\_op.fpgadataflow.duplicatestreams +---------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams_batch +.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.fmpadding +--------------------------------------- -finn.custom\_op.fpgadataflow.eltwise -------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.eltwise +.. automodule:: finn.custom_op.fpgadataflow.fmpadding :members: :undoc-members: :show-inheritance: - -finn.custom\_op.fpgadataflow.fmpadding\_batch +finn.custom\_op.fpgadataflow.fmpadding\_pixel ----------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.fmpadding_batch +.. automodule:: finn.custom_op.fpgadataflow.fmpadding_pixel :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.globalaccpool\_batch ---------------------------------------------------- +finn.custom\_op.fpgadataflow.globalaccpool +------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.globalaccpool_batch +.. automodule:: finn.custom_op.fpgadataflow.globalaccpool :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.iodma ------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.iodma - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.fpgadataflow.labelselect\_batch ------------------------------------------------ +finn.custom\_op.fpgadataflow.labelselect +----------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.labelselect_batch +.. automodule:: finn.custom_op.fpgadataflow.labelselect :members: :undoc-members: :show-inheritance: @@ -138,7 +128,7 @@ finn.custom\_op.fpgadataflow.lookup :show-inheritance: finn.custom\_op.fpgadataflow.matrixvectoractivation ------------------------------------------------------------ +----------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation :members: @@ -146,10 +136,10 @@ finn.custom\_op.fpgadataflow.matrixvectoractivation :show-inheritance: -finn.custom\_op.fpgadataflow.pool\_batch ------------------------------------------------ +finn.custom\_op.fpgadataflow.pool +---------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.pool_batch +.. automodule:: finn.custom_op.fpgadataflow.pool :members: :undoc-members: :show-inheritance: @@ -163,59 +153,50 @@ finn.custom\_op.fpgadataflow.streamingdataflowpartition :show-inheritance: -finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch ----------------------------------------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.fpgadataflow.streamingfifo -------------------------------------------------- +finn.custom\_op.fpgadataflow.streamingdatawidthconverter +--------------------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.streamingfifo +.. automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.streamingmaxpool\_batch ------------------------------------------------------------ +finn.custom\_op.fpgadataflow.streamingeltwise +---------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.streamingmaxpool_batch +.. automodule:: finn.custom_op.fpgadataflow.streamingeltwise :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.templates ---------------------------------------------- +finn.custom\_op.fpgadataflow.streamingfifo +------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.templates +.. automodule:: finn.custom_op.fpgadataflow.streamingfifo :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.thresholding\_batch -------------------------------------------------------- +finn.custom\_op.fpgadataflow.streamingmaxpool +---------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.thresholding_batch +.. automodule:: finn.custom_op.fpgadataflow.streamingmaxpool :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.thresholding\_binary\_search ------------------------------------------------------------ +finn.custom\_op.fpgadataflow.templates +---------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.thresholding_binary_search +.. automodule:: finn.custom_op.fpgadataflow.templates :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.thresholding +------------------------------------------ -finn.custom\_op.fpgadataflow.tlastmarker ------------------------------------------------ - -.. automodule:: finn.custom_op.fpgadataflow.tlastmarker +.. automodule:: finn.custom_op.fpgadataflow.thresholding :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst new file mode 100644 index 0000000000..b8a7f0d9e9 --- /dev/null +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -0,0 +1,46 @@ +***************************** +Custom Op - fpgadataflow.rtl +***************************** + +RTL Custom Op Nodes +=================== + +finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl +------------------------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.fmpadding\_rtl +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.fmpadding_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_rtl +--------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingfifo\_rtl +------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.thresholding\_rtl +------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.thresholding_rtl + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst index f7137ae347..f56b5fcf01 100644 --- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst +++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst @@ -38,10 +38,10 @@ finn.transformation.fpgadataflow.compile\_cppsim :undoc-members: :show-inheritance: -finn.transformation.fpgadataflow.convert\_to\_hls\_layers ----------------------------------------------------------------- +finn.transformation.fpgadataflow.convert\_to\_hw\_layers +---------------------------------------------------------- -.. automodule:: finn.transformation.fpgadataflow.convert_to_hls_layers +.. automodule:: finn.transformation.fpgadataflow.convert_to_hw_layers :members: :undoc-members: :show-inheritance: @@ -79,22 +79,29 @@ finn.transformation.fpgadataflow.externalize\_params :show-inheritance: finn.transformation.fpgadataflow.floorplan ----------------------------------------------------- +----------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.floorplan :members: :undoc-members: :show-inheritance: - finn.transformation.fpgadataflow.hlssynth\_ip ----------------------------------------------------- +----------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.hlssynth_ip :members: :undoc-members: :show-inheritance: +finn.transformation.fpgadataflow.infer\_pixel\_padding\_deconv +---------------------------------------------------------------- + +.. automodule:: finn.transformation.fpgadataflow.infer_pixel_padding_deconv + :members: + :undoc-members: + :show-inheritance: + finn.transformation.fpgadataflow.insert\_dwc --------------------------------------------------- @@ -139,14 +146,6 @@ finn.transformation.fpgadataflow.insert\_tlastmarker :undoc-members: :show-inheritance: -finn.transformation.fpgadataflow.make\_deployment --------------------------------------------------------- - -.. automodule:: finn.transformation.fpgadataflow.make_deployment - :members: - :undoc-members: - :show-inheritance: - finn.transformation.fpgadataflow.make\_pynq\_driver ---------------------------------------------------------- @@ -238,16 +237,24 @@ finn.transformation.fpgadataflow.set\_folding :undoc-members: :show-inheritance: -finn.transformation.fpgadataflow.synth\_ooc +finn.transformation.fpgadataflow.specialize\_layers ------------------------------------------------------- +.. automodule:: finn.transformation.fpgadataflow.specialize_layers + :members: + :undoc-members: + :show-inheritance: + +finn.transformation.fpgadataflow.synth\_ooc +--------------------------------------------- + .. automodule:: finn.transformation.fpgadataflow.synth_ooc :members: :undoc-members: :show-inheritance: finn.transformation.fpgadataflow.template\_driver -------------------------------------------------- +--------------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.template_driver :members: @@ -255,7 +262,7 @@ finn.transformation.fpgadataflow.template\_driver :show-inheritance: finn.transformation.fpgadataflow.templates -------------------------------------------------- +----------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.templates :members: diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index f42b595a50..1f4c9e495b 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -49,6 +49,14 @@ qonnx.transformation.change\_3d\_tensors\_to\_4d :undoc-members: :show-inheritance: +qonnx.transformation.change\_batchsize +---------------------------------------- + +.. automodule:: qonnx.transformation.change_batchsize + :members: + :undoc-members: + :show-inheritance: + qonnx.transformation.change\_datalayout -------------------------------------------- @@ -83,6 +91,14 @@ qonnx.transformation.double\_to\_single\_float :undoc-members: :show-inheritance: +qonnx.transformation.expose\_intermediate +------------------------------------------ + +.. automodule:: qonnx.transformation.expose_intermediate + :members: + :undoc-members: + :show-inheritance: + qonnx.transformation.extend\_partition ------------------------------------------ @@ -99,9 +115,16 @@ qonnx.transformation.extract\_conv\_bias :undoc-members: :show-inheritance: +qonnx.transformation.extract\_quant\_scale\_zeropt +------------------------------------------------ + +.. automodule:: qonnx.transformation.extract_quant_scale_zeropt + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.fold\_constants ------------------------------------------- +-------------------------------------- .. automodule:: qonnx.transformation.fold_constants :members: @@ -117,7 +140,7 @@ qonnx.transformation.gemm\_to\_matmul :show-inheritance: qonnx.transformation.general ----------------------------------- +------------------------------ .. automodule:: qonnx.transformation.general :members: @@ -165,7 +188,7 @@ qonnx.transformation.lower\_convs\_to\_matmul :show-inheritance: qonnx.transformation.make\_input\_chanlast ------------------------------------------- +--------------------------------------------- .. automodule:: qonnx.transformation.make_input_chanlast :members: @@ -180,6 +203,29 @@ qonnx.transformation.merge\_onnx\_models :undoc-members: :show-inheritance: +qonnx.transformation.pruning +------------------------------ + +.. automodule:: qonnx.transformation.pruning + :members: + :undoc-members: + :show-inheritance: + +qonnx.transformation.qcdq\_to\_qonnx +---------------------------------------- + +.. automodule:: qonnx.transformation.qcdq_to_qonnx + :members: + :undoc-members: + :show-inheritance: + +qonnx.transformation.qonnx\_to\_qcdq +------------------------------------- + +.. automodule:: qonnx.transformation.qonnx_to_qcdq + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.quant\_constant\_folding ---------------------------------------------- @@ -189,6 +235,13 @@ qonnx.transformation.quant\_constant\_folding :undoc-members: :show-inheritance: +qonnx.transformation.quantize\_graph +------------------------------------- + +.. automodule:: qonnx.transformation.quantize_graph + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.rebalance\_conv ---------------------------------------- @@ -199,13 +252,28 @@ qonnx.transformation.rebalance\_conv :show-inheritance: qonnx.transformation.remove -------------------------------------- +---------------------------- .. automodule:: qonnx.transformation.remove :members: :undoc-members: :show-inheritance: +qonnx.transformation.resize\_conv\_to\_deconv +----------------------------------------------- + +.. automodule:: qonnx.transformation.resize_conv_to_deconv + :members: + :undoc-members: + :show-inheritance: + +qonnx.transformation.subpixel\_to\_deconv +----------------------------------------------- + +.. automodule:: qonnx.transformation.subpixel_to_deconv + :members: + :undoc-members: + :show-inheritance: finn.transformation.move\_reshape ---------------------------------------- diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index aebd0604f4..2ec1502441 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -31,8 +31,16 @@ qonnx.util.config :undoc-members: :show-inheritance: +qonnx.util.convert +-------------------- + +.. automodule:: qonnx.util.convert + :members: + :undoc-members: + :show-inheritance: + qonnx.util.exec\_qonnx ----------------------- +------------------------ .. automodule:: qonnx.util.exec_qonnx :members: @@ -55,6 +63,37 @@ qonnx.util.onnx :undoc-members: :show-inheritance: +qonnx.util.prune\_channels +--------------------------- + +.. automodule:: qonnx.util.prune_channels + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.random\_reseed +-------------------------- + +.. automodule:: qonnx.util.random_reseed + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.range\_analysis +--------------------------- + +.. automodule:: qonnx.util.range_analysis + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.test +-------------------- + +.. automodule:: qonnx.util.test + :members: + :undoc-members: + :show-inheritance: qonnx.util.to\_channels\_last ------------------------------ @@ -81,8 +120,6 @@ finn.util.create :undoc-members: :show-inheritance: - - finn.util.data\_packing ------------------------------ diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index e1a9ac4b31..4b1821aca1 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -4,7 +4,7 @@ Functional Verification *********************** -.. image:: ../../notebooks/end2end_example/bnn-pynq/verification.png +.. image:: ../../notebooks/end2end_example/bnn-pynq/verification.svg :scale: 70% :align: center diff --git a/docs/requirements.txt b/docs/requirements.txt index 26c05d0025..85bc1d0dcd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -2,7 +2,9 @@ brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples dataclasses-json==0.5.7 docutils==0.17.1 gspread==3.6.0 +importlib_resources IPython +matplotlib netron pytest pyverilator@git+https://github.com/maltanar/pyverilator@master#egg=pyverilator @@ -10,4 +12,5 @@ qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx sphinx_rtd_theme==0.5.0 torch torchvision +tqdm vcdvcd diff --git a/requirements.txt b/requirements.txt index e03eff2c98..c2973f9432 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ ipython==8.12.2 numpy==1.24.1 onnx==1.13.0 onnxoptimizer -onnxruntime==1.15.0 +onnxruntime==1.16.1 pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 From 13afb71ee71e3f6e5120d6c2517fb7c8145c98f3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 10:31:37 +0000 Subject: [PATCH 227/291] updated mvu_rtl checker --- .../fpgadataflow/specialize_layers.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 94c0a87c03..25dfc0cc87 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -123,7 +123,8 @@ def _determine_impl_style(node): return "rtl" else: warn_str = """There is no RTL variant for %s. The node will automatically be - set to HLS variant.""" % ( + set to HLS variant. Please check the bit-widths to be <= 8 and ensure the + thresholds are implemented as standalone layer""" % ( node.name, ) warnings.warn(warn_str) @@ -210,21 +211,9 @@ def _mvu_rtl_possible(n): and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 ) weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 - folding_supported = ( - getCustomOp(n).get_nodeattr("MH") % getCustomOp(n).get_nodeattr("PE") == 0 - ) and (getCustomOp(n).get_nodeattr("MW") % getCustomOp(n).get_nodeattr("SIMD") == 0) - targets_dsp = getCustomOp(n).get_nodeattr("resType") in ["dsp", "auto"] - external_memmode = getCustomOp(n).get_nodeattr("mem_mode") in ["decoupled", "external"] no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 - return ( - inp_width_in_range - and weight_width_in_range - and folding_supported - and targets_dsp - and external_memmode - and no_activation - ) + return inp_width_in_range and weight_width_in_range and no_activation class SpecializeLayers(Transformation): From f1d4c2c7fdde83197dd6dc5b98789848ae4f924e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 10:34:44 +0000 Subject: [PATCH 228/291] [rtl mvau]: added more info to assertion message --- .../custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index dccdc67d00..24de50e8c3 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -193,7 +193,7 @@ def _resolve_impl_style(self, fpgapart): assert ( self.get_nodeattr("resType") != "lut" ), """LUT-based RTL-MVU implementation currently not supported! - Please change resType for {}""".format( + Please change resType for {} to 'dsp' or consider switching to HLS-based MVAU!""".format( self.onnx_node.name ) From 15ce083cf0a03e14267f0a46b53216feb8ac28bf Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 10:49:57 +0000 Subject: [PATCH 229/291] minor fix to if-branch --- src/finn/transformation/fpgadataflow/specialize_layers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 25dfc0cc87..bac92b27e1 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -128,6 +128,7 @@ def _determine_impl_style(node): node.name, ) warnings.warn(warn_str) + return "hls" if rtl_variant: return "rtl" From cfdf0bcec20fe6b4524c9efe642b97070ff54011 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 10:49:57 +0000 Subject: [PATCH 230/291] minor fix to if-branch --- src/finn/transformation/fpgadataflow/specialize_layers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 25dfc0cc87..bac92b27e1 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -128,6 +128,7 @@ def _determine_impl_style(node): node.name, ) warnings.warn(warn_str) + return "hls" if rtl_variant: return "rtl" From f87d29074cf802e5d9ae055e7f09f5c1296c88f2 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 15:45:54 +0000 Subject: [PATCH 231/291] [tests]: fixed assert statement for fifo characterization --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 7f76cf0af1..4be9e2bc2f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -581,7 +581,7 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( def test_mvau_fifocharacterize_rtlsim( mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style ): - if preferred_impl_style == "rtl" and (mem_mode == "const" or act is not None): + if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None): pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh @@ -627,8 +627,8 @@ def test_mvau_fifocharacterize_rtlsim( chrc_out = node_inst.get_nodeattr("io_chrc_out") assert chrc_in.shape == (1, 2 * exp_total_cycles) assert chrc_out.shape == (1, 2 * exp_total_cycles) - # first sf cycles should read input continuously - assert (chrc_in[0, :sf] == list(range(1, sf + 1))).all() + # total number of transactions == 2*SF + assert chrc_in[0, -1] == 2 * sf # all outputs should be produced within the exp n of cycles assert chrc_out[0, exp_total_cycles] == nf From a6e4376d3599c806fa9eba367cfec1067ea5f2d5 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 15:45:54 +0000 Subject: [PATCH 232/291] [tests]: fixed assert statement for fifo characterization --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 7f76cf0af1..4be9e2bc2f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -581,7 +581,7 @@ def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( def test_mvau_fifocharacterize_rtlsim( mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style ): - if preferred_impl_style == "rtl" and (mem_mode == "const" or act is not None): + if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None): pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh @@ -627,8 +627,8 @@ def test_mvau_fifocharacterize_rtlsim( chrc_out = node_inst.get_nodeattr("io_chrc_out") assert chrc_in.shape == (1, 2 * exp_total_cycles) assert chrc_out.shape == (1, 2 * exp_total_cycles) - # first sf cycles should read input continuously - assert (chrc_in[0, :sf] == list(range(1, sf + 1))).all() + # total number of transactions == 2*SF + assert chrc_in[0, -1] == 2 * sf # all outputs should be produced within the exp n of cycles assert chrc_out[0, exp_total_cycles] == nf From 79ca5726938195cd8a14917e2f267db80493b988 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:15:07 +0000 Subject: [PATCH 233/291] [rtl mvau]: update mem_mode options --- .../custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 24de50e8c3..a00ba72717 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -101,7 +101,7 @@ def execute_node(self, context, graph): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) reset_rtlsim(sim) toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: + if mem_mode in ["external", "internal_decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( From e2e0a4ccdb64b1f94da58d6add799343db9be457 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:15:20 +0000 Subject: [PATCH 234/291] [tests]: clean-up --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 4be9e2bc2f..2a22f3fc41 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -671,7 +671,7 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] # Create MVAU (HLS) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(GiveUniqueNodeNames()) # Apply convert-to-rtl step @@ -684,9 +684,7 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): "MVAU_rtl_0": { "PE": pe, "SIMD": simd, - "mem_mode": "decoupled", "resType": "dsp", - "preferred_impl_style": "rtl", }, } model = model.transform(ApplyConfig(folding_config)) @@ -710,7 +708,6 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] - assert ( output_matmul == output_mvau_rtl ).all(), "Output of ONNX model not matching output of node-by-node RTLsim!" From f38e7edd6754b54f4dc29bc8d7e618d5b7bdef50 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:15:07 +0000 Subject: [PATCH 235/291] [rtl mvau]: update mem_mode options --- .../custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 24de50e8c3..a00ba72717 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -101,7 +101,7 @@ def execute_node(self, context, graph): inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) reset_rtlsim(sim) toggle_clk(sim) - if mem_mode in ["external", "decoupled"]: + if mem_mode in ["external", "internal_decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( From 7c8dc6ddad4e1fa815c776288105862b40f665ab Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:15:20 +0000 Subject: [PATCH 236/291] [tests]: clean-up --- tests/fpgadataflow/test_fpgadataflow_mvau.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 4be9e2bc2f..2a22f3fc41 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -671,7 +671,7 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] # Create MVAU (HLS) - model = model.transform(to_hw.InferQuantizedMatrixVectorActivation(mem_mode="decoupled")) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(GiveUniqueNodeNames()) # Apply convert-to-rtl step @@ -684,9 +684,7 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): "MVAU_rtl_0": { "PE": pe, "SIMD": simd, - "mem_mode": "decoupled", "resType": "dsp", - "preferred_impl_style": "rtl", }, } model = model.transform(ApplyConfig(folding_config)) @@ -710,7 +708,6 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): model = model.transform(HLSSynthIP()) model = model.transform(PrepareRTLSim()) output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] - assert ( output_matmul == output_mvau_rtl ).all(), "Output of ONNX model not matching output of node-by-node RTLsim!" From a17bb19e03f639b8c4f029681a60163cba44e2cf Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:43:27 +0000 Subject: [PATCH 237/291] [renaming]: renamed VectorVectorActivation to VVAU due to buffer overflow PyVerilator for long names --- .../analysis/fpgadataflow/res_estimation.py | 2 +- src/finn/custom_op/fpgadataflow/__init__.py | 4 ++-- src/finn/custom_op/fpgadataflow/hls/__init__.py | 6 ++---- .../fpgadataflow/convert_to_hw_layers.py | 8 ++++---- .../transformation/fpgadataflow/set_folding.py | 17 +++++------------ tests/fpgadataflow/test_minimize_bit_width.py | 16 ++++++++-------- 6 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index c2d0cf7048..a6be1f1f53 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -62,7 +62,7 @@ def res_estimation_complete(model): if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) op_type = node.op_type - if op_type.startswith("MVAU") or op_type.startswith("VectorVectorActivation"): + if op_type.startswith("MVAU") or op_type.startswith("VVAU"): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 6154bdc924..aed2ab7fe1 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -53,7 +53,7 @@ from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour -from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU custom_op = dict() @@ -62,7 +62,7 @@ custom_op["MVAU"] = MVAU custom_op["StreamingFIFO"] = StreamingFIFO custom_op["Thresholding"] = Thresholding -custom_op["VectorVectorActivation"] = VectorVectorActivation +custom_op["VVAU"] = VVAU custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition custom_op["AddStreams"] = AddStreams diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py index 6e465fd0f2..405c47a08d 100644 --- a/src/finn/custom_op/fpgadataflow/hls/__init__.py +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -51,9 +51,7 @@ from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls -from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import ( - VectorVectorActivation_hls, -) +from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls custom_op = dict() @@ -80,4 +78,4 @@ custom_op["TLastMarker_hls"] = TLastMarker_hls custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls custom_op["MVAU_hls"] = MVAU_hls -custom_op["VectorVectorActivation_hls"] = VectorVectorActivation_hls +custom_op["VVAU_hls"] = VVAU_hls diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index fdb892e911..59c9f6f38d 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1636,7 +1636,7 @@ def apply(self, model): model.set_tensor_shape(mt_output, mt_out_shape) # create and insert new VectorVectorActivation node new_node = helper.make_node( - "VectorVectorActivation", + "VVAU", [mm_input, mm_weight, mt_thres], [mt_output], domain="finn.custom_op.fpgadataflow", @@ -1651,7 +1651,7 @@ def apply(self, model): outputDataType=odt.name, ActVal=actval, noActivation=0, - name="VectorVectorActivation_" + n.name, + name="VVAU_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old nodes @@ -1665,7 +1665,7 @@ def apply(self, model): model.set_tensor_shape(mm_output, mm_out_shape) # create and insert new VVAU node new_node = helper.make_node( - "VectorVectorActivation", + "VVAU", [mm_input, mm_weight], [mm_output], domain="finn.custom_op.fpgadataflow", @@ -1680,7 +1680,7 @@ def apply(self, model): outputDataType=odt.name, ActVal=0, noActivation=1, - name="VectorVectorActivation_" + n.name, + name="VVAU_" + n.name, ) graph.node.insert(node_ind, new_node) # remove old node diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index bff64d3885..10dd829971 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -119,7 +119,7 @@ def apply(self, model): ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation_hls", "Pool_hls"] + depthwise_op_exceptions = ["VVAU_hls", "Pool_hls"] for node in graph.node: if not (is_hls_node(node) or is_rtl_node(node)): continue @@ -157,18 +157,14 @@ def apply(self, model): self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VectorVectorActivation_hls": + if op_type == "VVAU_hls": node_inst.set_nodeattr("SIMD", 1) max_pe = node_inst.get_nodeattr("Channels") self.optimize_attribute_val(node_inst, max_pe, "PE") # increase SIMD for VVAU once PE is exhausted pe = node_inst.get_nodeattr("PE") cyc = node_inst.get_exp_cycles() - if ( - op_type == "VectorVectorActivation_hls" - and pe == max_pe - and cyc > self.target_cycles_per_frame - ): + if op_type == "VVAU_hls" and pe == max_pe and cyc > self.target_cycles_per_frame: max_simd = np.prod(node_inst.get_nodeattr("Kernel")) self.optimize_attribute_val(node_inst, max_simd, "SIMD") # also set the folding of the upsteam DW SWU @@ -179,15 +175,12 @@ def apply(self, model): swu_node_inst.set_nodeattr("SIMD", pe) # enable parallel_window mode of RTL SWG if needed if swu_node.op_type == "ConvolutionInputGenerator_rtl": - if ( - op_type == "VectorVectorActivation" - and node_inst.get_nodeattr("SIMD") > 1 - ): + if op_type.startswith("VVAU") and node_inst.get_nodeattr("SIMD") > 1: swu_node_inst.set_nodeattr("parallel_window", 1) else: swu_node_inst.set_nodeattr("parallel_window", 0) else: - if op_type == "VectorVectorActivation_hls": + if op_type == "VVAU_hls": ksize = np.prod(node_inst.get_nodeattr("Kernel")) elif op_type == "Pool_hls": ksize = node_inst.get_nodeattr("KernelSize") diff --git a/tests/fpgadataflow/test_minimize_bit_width.py b/tests/fpgadataflow/test_minimize_bit_width.py index 2b765610ab..4b26e7ac00 100644 --- a/tests/fpgadataflow/test_minimize_bit_width.py +++ b/tests/fpgadataflow/test_minimize_bit_width.py @@ -37,7 +37,7 @@ from typing import Optional, Union from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU -from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU from finn.transformation.fpgadataflow.minimize_accumulator_width import ( MinimizeAccumulatorWidth, ) @@ -52,7 +52,7 @@ def make_unit_test_model(wdt: DataType, idt: DataType, tdt: Optional[DataType] = inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 32, 32, 288]) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 32, 32, 64]) layer1 = helper.make_node( - "VectorVectorActivation", + "VVAU", ["inp", "params0", "thresh0"] if tdt is not None else ["inp", "params0"], ["hid"], domain="finn.custom_op.fpgadataflow", @@ -170,7 +170,7 @@ def test_minimize_weight_bit_width(wdt: DataType, rww: bool): # If runtime-writeable weights, specify as a node attribute for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MVAU, VectorVectorActivation)): + if isinstance(inst, (MVAU, VVAU)): inst.set_nodeattr("runtime_writeable_weights", int(rww)) # Apply the optimization @@ -179,14 +179,14 @@ def test_minimize_weight_bit_width(wdt: DataType, rww: bool): # Iterate through each node to make sure it functioned properly for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MVAU, VectorVectorActivation)): + if isinstance(inst, (MVAU, VVAU)): cur_wdt = DataType[inst.get_nodeattr("weightDataType")] exp_wdt = def_wdt if rww else wdt assert cur_wdt.bitwidth() == exp_wdt.bitwidth(), "Mismatched data types" def calculate_accumulator_bit_width( - inst: Union[MVAU, VectorVectorActivation], model: ModelWrapper + inst: Union[MVAU, VVAU], model: ModelWrapper ) -> Union[DataType, IntType]: """Calculate the accumulator bit width using the closed-form expressions derived in `Quantized Neural Networks for Low-Precision Accumulation @@ -208,7 +208,7 @@ def phi(x: float) -> float: # modify the weights based on if the node is a VVAU or MVAU if isinstance(inst, MVAU): K = inst.get_nodeattr("MW") # matrix_width = num_inputs - elif isinstance(inst, VectorVectorActivation): + elif isinstance(inst, VVAU): k_h, k_w = inst.get_nodeattr("Kernel") K = k_h * k_w # size of kernels = num_inputs fm = inst.get_nodeattr("Channels") @@ -275,7 +275,7 @@ def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, # If runtime-writeable weights, specify as a node attribute for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MVAU, VectorVectorActivation)): + if isinstance(inst, (MVAU, VVAU)): inst.set_nodeattr("runtime_writeable_weights", int(rww)) cur_adt = DataType[inst.get_nodeattr("accDataType")] assert cur_adt.bitwidth() == def_adt.bitwidth(), "Default data type is incorrect" @@ -286,7 +286,7 @@ def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, # Iterate through each node to make sure it functioned properly for node in model.graph.node: inst = getCustomOp(node) - if isinstance(inst, (MVAU, VectorVectorActivation)): + if isinstance(inst, (MVAU, VVAU)): cur_adt = DataType[inst.get_nodeattr("accDataType")] cur_odt = DataType[inst.get_nodeattr("outputDataType")] # Calculating expected accumulator bit width using a closed-form expression From 8a48cac635c9312be542e3425223e9335b71672f Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:44:13 +0000 Subject: [PATCH 238/291] [hls vvau]: renamed layer and added method to instantiate ip --- .../hls/vectorvectoractivation_hls.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index c7f0576495..dc38a18f4e 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -31,11 +31,11 @@ from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend -from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class VectorVectorActivation_hls(VectorVectorActivation, HLSBackend): +class VVAU_hls(VVAU, HLSBackend): """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" def __init__(self, onnx_node, **kwargs): @@ -43,7 +43,7 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = {} - my_attrs.update(VectorVectorActivation.get_nodeattr_types(self)) + my_attrs.update(VVAU.get_nodeattr_types(self)) my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs @@ -464,3 +464,12 @@ def get_verilog_top_module_intf_names(self): if runtime_writable: intf_names["axilite"] = ["s_axilite"] return intf_names + + def instantiate_ip(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "internal_decoupled": + cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) From 400c04350846680caf315140199ec9e70658249a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:44:57 +0000 Subject: [PATCH 239/291] [rtl vvau]: RTL VVAU custom-op --- .../custom_op/fpgadataflow/rtl/__init__.py | 2 + .../rtl/vectorvectoractivation_rtl.py | 301 ++++++++++++++++++ 2 files changed, 303 insertions(+) create mode 100644 src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index b7a798be98..1996539042 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -35,6 +35,7 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl custom_op = dict() @@ -45,3 +46,4 @@ custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl custom_op["StreamingFIFO_rtl"] = StreamingFIFO_rtl custom_op["MVAU_rtl"] = MVAU_rtl +custom_op["VVAU_rtl"] = VVAU_rtl diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py new file mode 100644 index 0000000000..c138cf05d5 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -0,0 +1,301 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.util.fpgadataflow import is_versal + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class VVAU_rtl(VVAU, RTLBackend): + """Class that corresponds to finn-rtl Vector Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(VVAU.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + VVAU.execute_node(self, context, graph) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for VectorVectorActivation") + in_ind += 1 + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + reset_rtlsim(sim) + toggle_clk(sim) + + if mem_mode in ["external", "internal_decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def lut_estimation(self): + return 0 + + def dsp_estimation(self): + Q = self.get_nodeattr("SIMD") + return int(np.ceil(Q / 3)) + + def instantiate_ip(self, cmd): + # instantiate the RTL IP + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert ( + clk > 0.741 + ), """Infeasible clk target of {} ns has been set, + consider lowering the targeted clock frequency!""".format( + clk + ) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + assert ( + self.get_nodeattr("resType") != "lut" + ), """LUT-based RTL-VVU implementation currently not supported! + Please change resType for {} to 'dsp' or consider switching to HLS-based VVAU!""".format( + self.onnx_node.name + ) + is_versal_family = is_versal(fpgapart) + assert ( + is_versal_family + ), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices" + + return "mvu_vvu_8sx9_dsp58" + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(0)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + mw = int(np.prod(self.get_nodeattr("Kernel"))) + code_gen_dict["$MW$"] = [str(mw)] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim + + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + return verilog_paths + + def get_verilog_top_filename(self): + "Return the Verilog top module filename for this node." + + verilog_file = "{}/{}_wrapper.v".format( + self.get_nodeattr("code_gen_dir_ipgen"), self.get_nodeattr("gen_top_module") + ) + return verilog_file From 84243422afbbc772d169b1f48ed06091b859da63 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:46:08 +0000 Subject: [PATCH 240/291] [vvau]: changed weight file generation and execution_node; accounted for possiblity of VVAU being either RTL/HLS based (influences weight storage) or parent layer being Im2Col-variant or SWG_rtl/hls --- .../fpgadataflow/vectorvectoractivation.py | 71 +++++++++++++------ 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index c5ec7e0648..efe78a6339 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -43,7 +43,7 @@ from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string -class VectorVectorActivation(HWCustomOp): +class VVAU(HWCustomOp): """Abstraction layer for HW implementation of VectorVectorActivation layers.""" def __init__(self, onnx_node, **kwargs): @@ -105,9 +105,6 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def base_op_type(self): - return "VectorVectorActivation" - def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels): W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32) for ch in range(channels): @@ -124,7 +121,17 @@ def execute_node(self, context, graph): (_, dim_h, dim_w, _) = in_act.shape (k_h, k_w) = self.get_nodeattr("Kernel") channels = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") + producer = [x for x in graph.node if x.output[0] == node.input[0]] + exec_mode = self.get_nodeattr("exec_mode") + if ( + not bool(producer) + or producer[0].op_type == "ConvolutionInputGenerator_hls" + or (producer[0].op_type == "ConvolutionInputGenerator_rtl" and exec_mode == "rtlsim") + ): + pe = self.get_nodeattr("PE") + else: + pe = channels + # Reorder the input activations. Note that PE gets interleaved by the SWG, # so we have to untangle and for simplicity of computation assume pe=1. # Note that PE has no effect on the QONNX node @@ -183,7 +190,14 @@ def infer_node_datatype(self, model): def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") def get_weight_datatype(self): """Returns FINN DataType of weights.""" @@ -198,7 +212,7 @@ def get_output_datatype(self, ind=0): return DataType[self.get_nodeattr("outputDataType")] def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() + i_bits = self.get_input_datatype(ind).bitwidth() simd = self.get_nodeattr("SIMD") pe = self.get_nodeattr("PE") in_width = i_bits * simd * pe @@ -499,7 +513,7 @@ def minimize_accumulator_width(self, model): # if the thresholds can be used to determine range, then adjust the range # according to the known values of the thresholds if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # set threshold datatype (and accumulator datatype implicitly) min_threshold = thresholds.min() max_threshold = thresholds.max() @@ -508,7 +522,7 @@ def minimize_accumulator_width(self, model): warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) thresholds = np.clip(thresholds, acc_min, acc_max) model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() acc_min = min(min_threshold, acc_min) @@ -566,7 +580,7 @@ def minimize_weight_bit_width(self, model): self.set_nodeattr("weightDataType", wdt.name) return DataType[self.get_nodeattr("weightDataType")] - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 @@ -691,6 +705,8 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) # PE flip for saving weights in .dat weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # SIMD & PE flip + weight_tensor_pe_simd_flipped = np.flip(weight_tensor_pe_flipped, axis=-1) # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") @@ -700,19 +716,32 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): # flipped weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + # SIMD & PE flipped + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.reshape(1, -1, pe * simd) + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.copy() if weight_file_mode == "decoupled_npy": # save weight stream into npy for cppsim - np.save(weight_file_name, weight_tensor_simd_flipped) + if self.onnx_node.op_type == "VVAU_rtl": + weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd) + weight_tensor_unflipped = weight_tensor_unflipped.copy() + np.save(weight_file_name, weight_tensor_unflipped) + else: + np.save(weight_file_name, weight_tensor_simd_flipped) elif weight_file_mode == "decoupled_verilog_dat": # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" - ) + if self.onnx_node.op_type == "VVAU_rtl": + weight_arr = pack_innermost_dim_as_hex_string( + weight_tensor_pe_simd_flipped, export_wdt, weight_width_padded, prefix="" + ) + else: + weight_arr = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) # add zeroes to pad out file to 1024 entries - weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_arr.flatten() weight_stream = weight_stream.copy() with open(weight_file_name, "w") as f: for val in weight_stream: @@ -772,7 +801,7 @@ def generate_params(self, model, path): if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] @@ -884,11 +913,9 @@ def code_generation_ipi(self): "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # Instantiate either the HLS or RTL IP depending on operator + self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" @@ -959,7 +986,7 @@ def code_generation_ipi(self): cmd.append("save_bd_design") elif mem_mode == "internal_embedded" or mem_mode == "external": # base class impl sufficient for internal_embedded/external modes - return super().code_generation_ipi() + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") return cmd From 0f25d43a2938dc1950fb56a1bcd050fdc5493090 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 16:46:41 +0000 Subject: [PATCH 241/291] [transform]: added support for converting to VVAU-RTL layer --- .../fpgadataflow/specialize_layers.py | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index bac92b27e1..080f7ca5a2 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -37,7 +37,7 @@ from finn.util.fpgadataflow import is_versal -def _determine_impl_style(node): +def _determine_impl_style(node, fpgapart): optype = node.op_type # check if there is an HLS or RTL variant or both @@ -59,6 +59,11 @@ def _determine_impl_style(node): return "rtl" else: return "hls" + elif optype == "VVAU": + if _vvu_rtl_possible(node, fpgapart): + return "rtl" + else: + return "hls" return "rtl" # but if no rtl variant, set impl_style to hls elif hls_variant: @@ -129,6 +134,18 @@ def _determine_impl_style(node): ) warnings.warn(warn_str) return "hls" + elif optype == "VVAU": + if _vvu_rtl_possible(node, fpgapart): + return "rtl" + else: + warn_str = """There is no RTL variant for %s. The node will automatically be + set to HLS variant. Please check the bit-widths to be <= 8 and ensure the + thresholds are implemented as standalone layer. Note that the RTL-variant + of this layer is only supported on Versal boards""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" if rtl_variant: return "rtl" @@ -217,6 +234,21 @@ def _mvu_rtl_possible(n): return inp_width_in_range and weight_width_in_range and no_activation +def _vvu_rtl_possible(n, fpgapart): + # Checks whether RTL-based VVU is supported + in_width_in_range = ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 + ) or ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 + and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 + ) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + is_versal_family = is_versal(fpgapart) + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + + return in_width_in_range and weight_width_in_range and is_versal_family and no_activation + + class SpecializeLayers(Transformation): """Specialize all layers to either HLS or RTL variants""" @@ -233,7 +265,7 @@ def apply(self, model): if not node.domain == "finn.custom_op.fpgadataflow": continue node_ind += 1 - impl_style = _determine_impl_style(node) + impl_style = _determine_impl_style(node, self.fpgapart) optype = node.op_type + "_" + impl_style new_node = helper.make_node( From 94f0830861092b52b55b01d787f47157d6272ea1 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 14 Mar 2024 17:55:15 +0000 Subject: [PATCH 242/291] [test]: added test for RTL-VVAU --- tests/fpgadataflow/test_fpgadataflow_vvau.py | 201 ++++++++++++++++++- 1 file changed, 192 insertions(+), 9 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index eb521f965a..98df27e3dd 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -32,21 +32,39 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers @@ -117,7 +135,7 @@ def _make_single_vvau_modelwrapper( actval = 0 VVAU_node = helper.make_node( - "VectorVectorActivation", + "VVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -158,10 +176,6 @@ def _make_single_vvau_modelwrapper( return model -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - # input datatype @pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]]) # weight datatype @@ -233,10 +247,12 @@ def test_fpgadataflow_vvau( model = _make_single_vvau_modelwrapper( W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) input_dict = prepare_inputs(x_vvau) - y_hwop = oxe.execute_onnx(model, input_dict)["outp"] - model = model.transform(SpecializeLayers()) + y_hwop = oxe.execute_onnx(model, input_dict)["global_out"] + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) @@ -272,7 +288,7 @@ def test_fpgadataflow_vvau( # signed offset y_expected += act.min() - y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["outp"] + y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["global_out"] assert (y_hwop == y_expected).all(), "VVAU HW-op mismatches with golden output!" assert (y_produced == y_expected).all(), "VVAU specialized-op mismatches with golden output!" @@ -285,3 +301,170 @@ def test_fpgadataflow_vvau( exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 + + +def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): + kernel_size, in_feature_dim, in_chn = conv_config + stride = 1 + pad = 0 + + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + group = out_chn = in_chn + + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = group + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] + + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape) + weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, conv_param_shape)] + + modelproto = qonnx_make_model( + helper.make_graph( + name="conv_test", + inputs=[ifm], + outputs=[ofm], + value_info=weights, + nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def prepare_inputs(input_tensor): + return {"global_in": input_tensor} + + +# kernel size (square) +@pytest.mark.parametrize("kernel_size", [3]) +# IFM size (square) +@pytest.mark.parametrize("in_feature_dim", [5]) +# input channels +@pytest.mark.parametrize("in_chn", [4]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["INT6"]]) +# targeted board +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +# pe +@pytest.mark.parametrize("pe", [1, 2, 4]) +# simd +@pytest.mark.parametrize("simd", [1, 3, 9]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, pe, simd): + # Create depthwise-separable convolution + conv_config = (kernel_size, in_feature_dim, in_chn) + model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # Obtain golden reference output + golden_in = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) + input_dict = prepare_inputs(golden_in) + golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)["global_out"] + + # Convert to HLS custom-op first + model = model.transform(LowerConvsToMatMul()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + output_vvau_hw = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)[ + "global_out" + ] + assert ( + golden_out == output_vvau_hw + ).all(), "Output of ONNX model not matching output of HW-ops!" + + # Obtain second reference from HLS-based VVAU layer + model = model.transform(SpecializeLayers(part)) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "ConvolutionInputGenerator_rtl_0": { + "SIMD": pe, + "parallel_window": 1, + }, + "VVAU_rtl_0": { + "PE": pe, + "SIMD": simd, + "resType": "dsp", + }, + } + model = model.transform(ApplyConfig(folding_config)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) + + # Run CPPsim + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + output_vvau_cppsim = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + golden_out == output_vvau_cppsim + ).all(), "Output of ONNX model not matching output of node-by-node CPPsim!" + + # Run node-by-node RTLsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_vvau_rtlsim = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)[ + "global_out" + ] + + assert ( + golden_out == output_vvau_rtlsim + ).all(), "Output of ONNX model not matching output of specialized HW-ops!" + + # Stitched-IP RTLsim + model = model.transform(CreateDataflowPartition()) + partition_model_path = getCustomOp( + model.get_nodes_by_op_type("StreamingDataflowPartition")[0] + ).get_nodeattr("model") + partitioned_model = ModelWrapper(partition_model_path) + # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism + partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5)) + partitioned_model = partitioned_model.transform(PrepareIP(part, 5)) + partitioned_model = partitioned_model.transform(HLSSynthIP()) + partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) + # set top-level prop for stitched-ip rtlsim and launch + partitioned_model.set_metadata_prop("exec_mode", "rtlsim") + # transpose input since we're now simulating HW layers (NCHW --> NHWC) + input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1)) + output_vvau_stitched = oxe.execute_onnx( + partitioned_model, input_dict, return_full_exec_context=True + )["global_out"] + # tranpose hardware-generated outputs NHWC -> NCHW to be comparable + output_vvau_stitched = output_vvau_stitched.transpose(0, 3, 1, 2) + + assert ( + golden_out == output_vvau_stitched + ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" From 3fe3e06052d8f11258e31009d5e52e8f30aeb3c3 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 13 Mar 2024 13:49:15 +0000 Subject: [PATCH 243/291] Broadcast quantization scale to channel dimension Added workaround to enable per tensor quantization based on channel dimensions, providing consistency with per channel quantization. --- .../qonnx/qonnx_activation_handlers.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 323e391df4..2617f803e7 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -515,7 +515,8 @@ def _calculate_thresholds(self): if bit_width == 1.0: thresholds = np.empty([1, 1], dtype=np_default_dtype) thresholds[0] = 0 - return thresholds + num_thresholds = 1 + else: if narrow: num_distinct_values = 2**bit_width - 1 @@ -537,13 +538,13 @@ def _calculate_thresholds(self): for t in range(num_thresholds): thresholds[c][t] = min_threshold[c] + step[c] * t - # ToDo: The index 1 needs to be changed to -1 for the channels last format - num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] - final_shape = (num_output_channels, num_thresholds) - if thresholds.shape != final_shape: - thresholds = np.broadcast_to(thresholds, final_shape) + # ToDo: The index 1 needs to be changed to -1 for the channels last format + num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] + final_shape = (num_output_channels, num_thresholds) + if thresholds.shape != final_shape: + thresholds = np.broadcast_to(thresholds, final_shape) - return thresholds + return thresholds def _calculate_act_scale(self): # Gather parameters From 88f59b32c55cfb5f099e7f955180a8fbe3f0b9ee Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 15 Mar 2024 10:55:18 +0000 Subject: [PATCH 244/291] Broadcast per tensor threshold weights to all channels --- .../custom_op/fpgadataflow/rtl/thresholding_rtl.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index f30a305dfe..eaef2f30f2 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -188,6 +188,12 @@ def prepare_codegen_rtl_values(self, model): o_bitwidth = DataType[output_data_type].bitwidth() num_channels = self.get_nodeattr("NumChannels") # number of channels + # If a single threshold value is found, broadcast it to all channels + n_thres_steps = self.get_nodeattr("numSteps") + expected_shape = (num_channels, n_thres_steps) + if t_packed.shape != expected_shape: + t_packed = np.broadcast_to(t_packed, expected_shape) + channel_fold = int(num_channels / pe) for stage in range(o_bitwidth): @@ -507,6 +513,12 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): ch = self.get_nodeattr("NumChannels") n_thres_steps = self.get_nodeattr("numSteps") + # If a single threshold value is found, broadcast it to all channels + n_thres_steps = self.get_nodeattr("numSteps") + expected_shape = (ch, n_thres_steps) + if weights.shape != expected_shape: + weights = np.broadcast_to(weights, expected_shape) + width_padded = roundup_to_integer_multiple(weights.shape[1], 4) weight_padded = np.zeros((weights.shape[0], width_padded)) weight_padded[: weights.shape[0], :n_thres_steps] = weights From e7d5af3d2644a4591c52ccb22d3d236845cef2be Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 15 Mar 2024 11:00:08 +0000 Subject: [PATCH 245/291] Revert "Broadcast quantization scale to channel dimension" This reverts commit 3fe3e06052d8f11258e31009d5e52e8f30aeb3c3. --- .../qonnx/qonnx_activation_handlers.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 2617f803e7..323e391df4 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -515,8 +515,7 @@ def _calculate_thresholds(self): if bit_width == 1.0: thresholds = np.empty([1, 1], dtype=np_default_dtype) thresholds[0] = 0 - num_thresholds = 1 - + return thresholds else: if narrow: num_distinct_values = 2**bit_width - 1 @@ -538,13 +537,13 @@ def _calculate_thresholds(self): for t in range(num_thresholds): thresholds[c][t] = min_threshold[c] + step[c] * t - # ToDo: The index 1 needs to be changed to -1 for the channels last format - num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] - final_shape = (num_output_channels, num_thresholds) - if thresholds.shape != final_shape: - thresholds = np.broadcast_to(thresholds, final_shape) + # ToDo: The index 1 needs to be changed to -1 for the channels last format + num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] + final_shape = (num_output_channels, num_thresholds) + if thresholds.shape != final_shape: + thresholds = np.broadcast_to(thresholds, final_shape) - return thresholds + return thresholds def _calculate_act_scale(self): # Gather parameters From c0a1d73ffe09c141133cb9a71fc962e4ac0d71cf Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 11:38:48 +0000 Subject: [PATCH 246/291] [mvau]: update mem_mode name --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index db8a04b0d3..92e7b169c6 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -859,7 +859,7 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() if mem_mode == "external": intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: From a43d96ccc586e08013d0670da07bc133c605a9f0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 11:38:48 +0000 Subject: [PATCH 247/291] [mvau]: update mem_mode name --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index db8a04b0d3..92e7b169c6 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -859,7 +859,7 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() if mem_mode == "external": intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: From 73bfb3440fb4e590a3da7307ebed3a79b75d3ea3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 11:46:11 +0000 Subject: [PATCH 248/291] [vvau]: moved/added get_verilog_top_module_intf_names to HW-custom op --- .../fpgadataflow/hls/vectorvectoractivation_hls.py | 13 ------------- .../fpgadataflow/vectorvectoractivation.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index dc38a18f4e..fbae9eb9b8 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -452,19 +452,6 @@ def pragmas(self): ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") ) - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) - if mem_mode == "internal_decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - def instantiate_ip(self, cmd): # instantiate the HLS IP vlnv = self.get_nodeattr("ip_vlnv") diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index efe78a6339..7f1bf72964 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -885,6 +885,19 @@ def derive_characteristic_fxns(self, period): io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + mem_mode = self.get_nodeattr("mem_mode") + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "internal_decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names + def code_generation_ipi(self): cmd = [] # add streamer if needed From b2a87d62f7b844d991d46bf037e0c93d24b0d9aa Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 14:54:14 +0000 Subject: [PATCH 249/291] cleaned up comments and obsolete methods --- .../hls/matrixvectoractivation_hls.py | 2 +- .../rtl/matrixvectoractivation_rtl.py | 21 ++----------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index 9043496328..94f8cc0845 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -35,7 +35,7 @@ from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# ONNX i/o tensor shape assumptions for MatrixVectorActivation_hls: # input 0 is the input tensor, shape (.., i_size) = (..., MW) # input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) # (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index a00ba72717..d48b3a918d 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -41,10 +41,9 @@ PyVerilator = None -# ONNX i/o tensor shape assumptions for MatrixVectorActivation: +# ONNX i/o tensor shape assumptions for MatrixVectorActivation_rtl: # input 0 is the input tensor, shape (.., i_size) = (..., MW) # input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) -# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) # output 0 is the output tensor, shape (.., o_size) = (..., MH) # the ... here can be any shape (representing groups of vectors) @@ -92,7 +91,7 @@ def execute_node(self, context, graph): os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), reshaped_input, ) - elif in_ind > 2: + elif in_ind > 1: raise Exception("Unexpected input found for MatrixVectorActivation_rtl") in_ind += 1 @@ -291,19 +290,3 @@ def prepare_rtlsim(self): self.set_nodeattr("rtlsim_so", sim.lib._name) return sim - - def get_all_verilog_paths(self): - "Return list of all folders containing Verilog code for this node." - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] - return verilog_paths - - def get_verilog_top_filename(self): - "Return the Verilog top module filename for this node." - - verilog_file = "{}/{}_wrapper.v".format( - self.get_nodeattr("code_gen_dir_ipgen"), self.get_nodeattr("gen_top_module") - ) - return verilog_file From 9eb746ad4fb9d1a688df7b9e2d925a4e2223de42 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 14:54:41 +0000 Subject: [PATCH 250/291] [mvau]: set default resType to auto --- src/finn/custom_op/fpgadataflow/matrixvectoractivation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 92e7b169c6..7bbe4c04e9 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -63,7 +63,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), From c3bfa3f6ce0e533a11943633b648c659d0cedd7e Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 14:55:11 +0000 Subject: [PATCH 251/291] [folding]: add MVAU_rtl in auto-folding --- src/finn/transformation/fpgadataflow/set_folding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index a755d37a9d..cd117f835b 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -126,7 +126,7 @@ def apply(self, model): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "MVAU_hls": + if op_type in ["MVAU_hls", "MVAU_rtl"]: max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) From 2f2db73dab0960dc7691bc439bec03facbbf8ac7 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 14:55:47 +0000 Subject: [PATCH 252/291] [transform]: added comments and extra check to prevent binaryxnor_mode MVAU to be converted to (unsupported) RTL-MVAU --- src/finn/transformation/fpgadataflow/specialize_layers.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 5ba7bfac60..fa0285692f 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -209,6 +209,9 @@ def _swg_hls_possible(node): def _mvu_rtl_possible(n): # Checks whether RTL-based MVU is supported + # Currently, for DSP48 we only support 8sx8s and for + # DSP58 we support 8sx9s. Next to that, embedded thresholding + # functionality is not supported and neither binaryxnormode computation inp_width_in_range = ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 ) or ( @@ -217,8 +220,9 @@ def _mvu_rtl_possible(n): ) weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + not_binaryxnor_mode = getCustomOp(n).get_nodeattr("binaryXnorMode") == 0 - return inp_width_in_range and weight_width_in_range and no_activation + return inp_width_in_range and weight_width_in_range and no_activation and not_binaryxnor_mode class SpecializeLayers(Transformation): From b05002416181d364c7f46ce89d0535823f6b7c53 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 15 Mar 2024 15:49:50 +0000 Subject: [PATCH 253/291] [HWop/Tests] Cleanup of unsused fct in HWCustomOp and invalid skipping in test --- src/finn/custom_op/fpgadataflow/hwcustomop.py | 10 ---------- tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py | 3 --- 2 files changed, 13 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index 854587afc4..57c0fec067 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -126,16 +126,6 @@ def get_verilog_top_module_intf_names(self): intf_names["ap_none"] = [] return intf_names - def get_verilog_top_filename(self): - "Return the Verilog top module filename for this node." - - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - self.get_nodeattr("code_gen_dir_ipgen"), - self.onnx_node.name, - self.get_verilog_top_module_name(), - ) - return verilog_file - def get_rtlsim(self): """Return a PyVerilator wrapper for the Verilator emulation library for this node.""" diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 5e06cf9904..cdc3a7e423 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -88,9 +88,6 @@ def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_m pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("Skip cppsim if SWG is in rtl") - if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, k_h, k_w] From f61aa0d5e36370498791a6c0f4caa9a544bd5116 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 16:03:58 +0000 Subject: [PATCH 254/291] add MVAU_rtl extension --- src/finn/transformation/fpgadataflow/insert_iodma.py | 2 +- src/finn/transformation/fpgadataflow/set_fifo_depths.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 96f23ca320..1c4b4d7398 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -199,7 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type in ["MVAU_hls", "VectorVectorActivation_hls"] + lambda x: x.op_type in ["MVAU_hls", "MVAU_rtl", "VectorVectorActivation_hls"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index c60348876a..d3aab968d5 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -259,7 +259,7 @@ def __init__( def apply(self, model): # these optypes may potentially use external weights # we'll temporarily change them to use decoupled mode for FIFO sizing - extw_optypes = ["MVAU_hls", "VectorVectorActivation_hls"] + extw_optypes = ["MVAU_hls", "MVAU_rtl", "VectorVectorActivation_hls"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] From e4caf06d83d83f9f3889c7ded03e9f078f6b6fc3 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 16:10:06 +0000 Subject: [PATCH 255/291] update comments --- src/finn/transformation/fpgadataflow/specialize_layers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index fa0285692f..a68b69aa45 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -209,9 +209,10 @@ def _swg_hls_possible(node): def _mvu_rtl_possible(n): # Checks whether RTL-based MVU is supported - # Currently, for DSP48 we only support 8sx8s and for - # DSP58 we support 8sx9s. Next to that, embedded thresholding - # functionality is not supported and neither binaryxnormode computation + # Currently, for DSP48 we only support computations up to + # 8sx8s and for DSP58 we support up to 8sx9s. Next to that, + # embedded thresholding functionality is not supported and + # neither binaryxnormode computation inp_width_in_range = ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 ) or ( From 6f07732cedc07bbf7a09ad94a7ad537f9de47a9c Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Fri, 15 Mar 2024 16:48:54 +0000 Subject: [PATCH 256/291] cleaned up with pre-commit --- finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv | 36 +++++++++++++-------------- finn-rtllib/mvu/mvu_vvu_axi.sv | 4 +-- finn-rtllib/mvu/tb/mvu_8sx9_tb.sv | 10 ++++---- finn-rtllib/mvu/tb/mvu_axi_tb.sv | 12 +-------- 4 files changed, 26 insertions(+), 36 deletions(-) diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv index 2cc6cf1bcf..3bbc7051b9 100644 --- a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -78,7 +78,8 @@ module mvu_vvu_8sx9_dsp58 #( //-------------------- Shift register for opmode select signal --------------------\\ localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) - logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). + // Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) always_ff @(posedge clk) begin if(rst) L <= '{default: 0}; @@ -115,16 +116,16 @@ module mvu_vvu_8sx9_dsp58 #( always_ff @(posedge clk) begin if (rst) A <= '{default: 0}; else if(en) begin - A[EXTERNAL_PREGS-1] <= - // synthesis translate_off - zero ? '1 : - // synthesis translate_on + A[EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on a[SIMD*k + 3*i +: LANES_OCCUPIED]; if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; end end for (genvar j=0; j 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; @@ -179,11 +180,10 @@ module mvu_vvu_8sx9_dsp58 #( end : genExternalPregWeight else begin : genInpDSPWeight for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin - assign b_in_i[i][j][8*k +: 8] = -// synthesis translate_off - zero ? '1 : -// synthesis translate_on - //PAD_BITS_WEIGHT == 0 ? w[i][3*j+k] : { {PAD_BITS_WEIGHT{w[i][3*j+k][WEIGHT_WIDTH-1]}}, w[i][3*j+k] }; + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; end : genBin for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv index 2a7403b6b3..6498530113 100644 --- a/finn-rtllib/mvu/mvu_vvu_axi.sv +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -31,7 +31,7 @@ * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. * @details * The following compute cores are supported: - * - 4-bit MVU on DSP48 achieving 4 MACs/DSP, + * - 4-bit MVU on DSP48 achieving 4 MACs/DSP, * - (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, * - [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, * Folding hints: @@ -184,7 +184,7 @@ module mvu_vvu_axi #( uwire ovld; uwire dsp_p_t odat; if(1) begin : blkDsp - localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; + localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv index c8bfe5370a..34b5d8eb53 100644 --- a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -40,7 +40,7 @@ module mvu_8sx9_tb(); localparam int unsigned MW = 600; localparam int unsigned SIMD = 60; localparam int unsigned SEGMENTLEN = 4; - // Bit-width config + // Bit-width config localparam int unsigned ACTIVATION_WIDTH = 8; localparam int unsigned WEIGHT_WIDTH = 4; localparam bit SIGNED_ACTIVATIONS = 1; @@ -76,7 +76,7 @@ module mvu_8sx9_tb(); for (int i = 0; i>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); - else begin + else begin $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); $stop; - end + end end NF_CNT += 1; end diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv index 51bf623831..4ed7b4bf5f 100644 --- a/finn-rtllib/mvu/tb/mvu_axi_tb.sv +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -156,16 +156,6 @@ module mvu_axi_tb(); function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); automatic output_vector_t res = '{default: 0}; - // for (int j = 0; j 1 ? $signed(a[i/SIMD/PE][i % (SIMD*PE)]) : $signed(a[i/SIMD/PE][(i)%(SIMD*PE)]) ) * $signed(w[0][i/SIMD/PE][i/PE][i%SIMD]); - // else - // res[j/PE][j%PE] = IS_MVU ? $signed(res[j/PE][j%PE]) + $signed({1'b0, a[i/SIMD][i%SIMD]}) * $signed(w[j/PE][i/SIMD][j%PE][i%SIMD]) : - // $signed(res[j/PE][j%PE]) + ( PE > 1 ? $signed({1'b0, a[i/SIMD/PE][i % (SIMD*PE)]}) : $signed({1'b0, a[i/SIMD/PE][i%(SIMD*PE)]}) ) * $signed(w[0][i/SIMD][0][i%SIMD]); - // end - // end // The input stream will have the channels interleaved for VVU when PE>1 // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: @@ -236,4 +226,4 @@ module mvu_axi_tb(); .m_axis_output_tready(outputs.rdy) ); -endmodule : mvu_axi_tb \ No newline at end of file +endmodule : mvu_axi_tb From bd16f2e611e8b28a513d2b3417bdc8c4e87f47f5 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 19 Mar 2024 11:55:44 +0000 Subject: [PATCH 257/291] [Tests] Fix checks for tests if converted to RTL MVU --- .../fpgadataflow/rtl/thresholding_rtl.py | 2 +- .../test_convert_to_hw_1d_conv_layer.py | 6 +++++- .../test_convert_to_hw_conv_layer.py | 6 +++++- .../test_convert_to_hw_layers_cnv.py | 14 ++++++-------- .../fpgadataflow/test_convert_to_hw_layers_fc.py | 16 ++++++++-------- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index f30a305dfe..84fcc01439 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -296,7 +296,7 @@ def dump_rtl_data(self, dest_dir, filename, data): f.write(data) return - def generate_hdl(self, model): + def generate_hdl(self, model, fpgapart, clk): """Prepare HDL files from templates for synthesis""" # Generate a dictionary of values to put in RTL template code_gen_dict = self.prepare_codegen_rtl_values(model) diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index cdc3a7e423..3e8f30422b 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -147,7 +147,11 @@ def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_m else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) new_model = new_model.transform(SpecializeLayers()) - fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + # set folding parameters for MVAU + if new_model.get_nodes_by_op_type("MVAU_hls"): + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + else: + fc_node = new_model.get_nodes_by_op_type("MVAU_rtl")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py index ddcf386377..18fce769fc 100644 --- a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -135,7 +135,11 @@ def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode else: new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) new_model = new_model.transform(SpecializeLayers()) - fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + # set folding parameters for MVAU + if new_model.get_nodes_by_op_type("MVAU_hls"): + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + else: + fc_node = new_model.get_nodes_by_op_type("MVAU_rtl")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index ff61867fde..71f383ca23 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -58,6 +58,7 @@ from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.test import get_test_model_trained export_onnx_path_cnv = "test_convert_to_hw_layers_cnv.onnx" @@ -101,18 +102,15 @@ def test_convert_to_hw_layers_cnv_w1a1(fused_activation): # subsequently, the FC inference will generate passthrough MVAUs if not fused_activation: model = model.transform(to_hw.InferThresholdingLayer()) - tr_nodes = model.get_nodes_by_op_type("Thresholding") - for tr in tr_nodes: - tr_inst = getCustomOp(tr) - tr_inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hw.InferConvInpGen()) - conv_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator") - for cnv in conv_nodes: - cnv_inst = getCustomOp(cnv) - cnv_inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(to_hw.InferStreamingMaxPool()) + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers()) for node in model.graph.node: if node.op_type == "MVAU_hls": diff --git a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py index d00521f09f..746ded9074 100644 --- a/tests/fpgadataflow/test_convert_to_hw_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -84,22 +84,22 @@ def test_convert_to_hw_layers_tfc_w1a1(): model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MVAU_hls" + assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 1] fc1 = model.graph.node[3] - assert fc1.op_type == "MVAU_hls" + assert fc1.op_type.startswith("MVAU") assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 1] fc2 = model.graph.node[4] - assert fc2.op_type == "MVAU_hls" + assert fc2.op_type.startswith("MVAU") assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 1] fc3 = model.graph.node[5] - assert fc3.op_type == "MVAU_hls" + assert fc3.op_type.startswith("MVAU") assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] @@ -157,22 +157,22 @@ def test_convert_to_hw_layers_tfc_w1a2(): model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MVAU_hls" + assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 2] fc1 = model.graph.node[3] - assert fc1.op_type == "MVAU_hls" + assert fc1.op_type.startswith("MVAU") assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 2] fc2 = model.graph.node[4] - assert fc2.op_type == "MVAU_hls" + assert fc2.op_type.startswith("MVAU") assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 2] fc3 = model.graph.node[5] - assert fc3.op_type == "MVAU_hls" + assert fc3.op_type.startswith("MVAU") assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] fc0w = getCustomOp(fc0) From f70f53127e6ed1ce180e70f387630aa17942ed9f Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 19 Mar 2024 13:13:37 +0000 Subject: [PATCH 258/291] [Tests] Update tests --- .../fpgadataflow/convert_to_hw_layers.py | 3 +-- ...pgadataflow_convinputgenerator_rtl_dynamic.py | 1 + tests/fpgadataflow/test_fpgadataflow_deconv.py | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 27f257b917..d5f5fb4dee 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -44,9 +44,8 @@ class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" - def __init__(self, use_rtl_variant=False): + def __init__(self): super().__init__() - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 766a294977..3ad0bc4324 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -270,6 +270,7 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16]) getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) comp_nodes = model.get_nodes_by_op_type("MVAU_hls") + comp_nodes += model.get_nodes_by_op_type("MVAU_rtl") comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation_hls") for comp_node in comp_nodes: if depthwise: diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index ce8e1ce003..28e58bfba9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -166,7 +166,7 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"] model = ref_model.transform(InferPixelPaddingDeconv()) - model = model.transform(InferConvInpGen(use_rtl_variant=convinpgen_rtl)) + model = model.transform(InferConvInpGen()) model = model.transform(InferQuantizedMatrixVectorActivation()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) @@ -174,7 +174,6 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, for n in model.graph.node: if n.op_type == "ConvolutionInputGenerator" and not convinpgen_rtl: convinputgen_node = getCustomOp(n) - convinputgen_node.set_nodeattr("SIMD", simd) # to test cppsim, set preferred_impl_style for swg to hls convinputgen_node.set_nodeattr("preferred_impl_style", "hls") elif n.op_type == "FMPadding": @@ -182,13 +181,22 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, pad_node.set_nodeattr("preferred_impl_style", "hls") elif n.op_type == "MVAU": mvau_node = getCustomOp(n) - mvau_node.set_nodeattr("PE", pe) - mvau_node.set_nodeattr("SIMD", simd) + mvau_node.set_nodeattr("preferred_impl_style", "hls") y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() model = model.transform(SpecializeLayers()) + + for n in model.graph.node: + if n.op_type.startswith("ConvolutionInputGenerator"): + convinputgen_node = getCustomOp(n) + convinputgen_node.set_nodeattr("SIMD", simd) + elif n.op_type.startswith("MVAU"): + mvau_node = getCustomOp(n) + mvau_node.set_nodeattr("PE", pe) + mvau_node.set_nodeattr("SIMD", simd) + expected_oshape = (1, ofm_ch, odim_h, odim_w) # cppsim From 3524e169f2ba96dfcfa008e6932b66ccb54fd589 Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 19 Mar 2024 17:26:59 +0000 Subject: [PATCH 259/291] [Tests] Add minimize accumulator width to deconv test --- .../fpgadataflow/test_fpgadataflow_deconv.py | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py index 28e58bfba9..f1fc989066 100644 --- a/tests/fpgadataflow/test_fpgadataflow_deconv.py +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -49,6 +49,9 @@ from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import ( InferPixelPaddingDeconv, ) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -147,14 +150,6 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, idim_h, idim_w = idim stride_h, stride_w = stride - if idim_h == idim_w and stride_h == stride_w: - convinpgen_rtl = False - else: - convinpgen_rtl = True - - if exec_mode == "cppsim" and convinpgen_rtl: - pytest.skip("ConvolutionInputGenerator_rtl has no cppsim, skipping cppsim") - ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding) odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1 @@ -171,22 +166,11 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) - for n in model.graph.node: - if n.op_type == "ConvolutionInputGenerator" and not convinpgen_rtl: - convinputgen_node = getCustomOp(n) - # to test cppsim, set preferred_impl_style for swg to hls - convinputgen_node.set_nodeattr("preferred_impl_style", "hls") - elif n.op_type == "FMPadding": - pad_node = getCustomOp(n) - pad_node.set_nodeattr("preferred_impl_style", "hls") - elif n.op_type == "MVAU": - mvau_node = getCustomOp(n) - mvau_node.set_nodeattr("preferred_impl_style", "hls") - y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced == y_expected).all() model = model.transform(SpecializeLayers()) + model = model.transform(MinimizeAccumulatorWidth()) for n in model.graph.node: if n.op_type.startswith("ConvolutionInputGenerator"): From 9c8406bb682de6a8fe141e71a8f5914b14c5d09b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 19 Mar 2024 17:51:51 +0000 Subject: [PATCH 260/291] [transform]: updated comment VVU-RTL checker --- src/finn/transformation/fpgadataflow/specialize_layers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index fbcc2a48b4..628de08a3e 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -245,6 +245,8 @@ def _mvu_rtl_possible(n): def _vvu_rtl_possible(n, fpgapart): # Checks whether RTL-based VVU is supported + # Currently, we only support RTL-VVU on DSP58 up to 8sx9s inputs. + # Next to that, embedded thresholding functionality is not supported in_width_in_range = ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 ) or ( From 212c44aab785a5b129a9403d578634c50a63b360 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 20 Mar 2024 11:35:42 +0000 Subject: [PATCH 261/291] [transform]: fix to default to HLS MVAU if bit-width < 4 --- src/finn/transformation/fpgadataflow/specialize_layers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index a68b69aa45..cabbd26a65 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -59,7 +59,13 @@ def _determine_impl_style(node): return "hls" if rtl_variant: if optype == "MVAU": - if _mvu_rtl_possible(node): + inp_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 + ) + weight_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 + ) + if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node): return "rtl" else: return "hls" From f7e1a83a58969fe8bbd1e37193f2c147233726ec Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 20 Mar 2024 11:50:06 +0000 Subject: [PATCH 262/291] [rtl vvau]: removed unused methods --- .../rtl/vectorvectoractivation_rtl.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py index c138cf05d5..b315d913e4 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -283,19 +283,3 @@ def prepare_rtlsim(self): self.set_nodeattr("rtlsim_so", sim.lib._name) return sim - - def get_all_verilog_paths(self): - "Return list of all folders containing Verilog code for this node." - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] - return verilog_paths - - def get_verilog_top_filename(self): - "Return the Verilog top module filename for this node." - - verilog_file = "{}/{}_wrapper.v".format( - self.get_nodeattr("code_gen_dir_ipgen"), self.get_nodeattr("gen_top_module") - ) - return verilog_file From 77046541ef020be2c89d1ed581045167b210956a Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 20 Mar 2024 11:52:48 +0000 Subject: [PATCH 263/291] renamed VectorVectorActivation_{hls,rtl} to VVAU_{hls,rtl} --- src/finn/transformation/fpgadataflow/insert_iodma.py | 2 +- .../transformation/fpgadataflow/set_fifo_depths.py | 2 +- src/finn/transformation/fpgadataflow/set_folding.py | 12 ++++++++---- tests/end2end/test_end2end_mobilenet_v1.py | 2 +- .../fpgadataflow/test_convert_to_hw_1d_conv_layer.py | 2 +- tests/fpgadataflow/test_convert_to_hw_conv_layer.py | 2 +- ...st_fpgadataflow_convinputgenerator_rtl_dynamic.py | 3 ++- tests/fpgadataflow/test_fpgadataflow_vvau.py | 2 +- 8 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 1c4b4d7398..91d4ab1559 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -199,7 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type in ["MVAU_hls", "MVAU_rtl", "VectorVectorActivation_hls"] + lambda x: x.op_type in ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"] and getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index d3aab968d5..82ee536d50 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -259,7 +259,7 @@ def __init__( def apply(self, model): # these optypes may potentially use external weights # we'll temporarily change them to use decoupled mode for FIFO sizing - extw_optypes = ["MVAU_hls", "MVAU_rtl", "VectorVectorActivation_hls"] + extw_optypes = ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 1d11e91125..eaee499e6a 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -120,7 +120,7 @@ def apply(self, model): ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VVAU_hls", "Pool_hls"] + depthwise_op_exceptions = ["VVAU_hls", "VVAU_rtl", "Pool_hls"] for node in graph.node: if not (is_hls_node(node) or is_rtl_node(node)): continue @@ -158,14 +158,18 @@ def apply(self, model): self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: # init/reset SIMD of VVAU - if op_type == "VVAU_hls": + if op_type in ["VVAU_hls", "VVAU_rtl"]: node_inst.set_nodeattr("SIMD", 1) max_pe = node_inst.get_nodeattr("Channels") self.optimize_attribute_val(node_inst, max_pe, "PE") # increase SIMD for VVAU once PE is exhausted pe = node_inst.get_nodeattr("PE") cyc = node_inst.get_exp_cycles() - if op_type == "VVAU_hls" and pe == max_pe and cyc > self.target_cycles_per_frame: + if ( + op_type in ["VVAU_hls", "VVAU_rtl"] + and pe == max_pe + and cyc > self.target_cycles_per_frame + ): max_simd = np.prod(node_inst.get_nodeattr("Kernel")) self.optimize_attribute_val(node_inst, max_simd, "SIMD") # also set the folding of the upsteam DW SWU @@ -181,7 +185,7 @@ def apply(self, model): else: swu_node_inst.set_nodeattr("parallel_window", 0) else: - if op_type == "VVAU_hls": + if op_type in ["VVAU_hls", "VVAU_rtl"]: ksize = np.prod(node_inst.get_nodeattr("Kernel")) elif op_type == "Pool_hls": ksize = node_inst.get_nodeattr("KernelSize") diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index eec303d29e..86b698278e 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -284,7 +284,7 @@ def test_end2end_mobilenet_folding(): getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type) # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer - vvau_layers = model.get_nodes_by_op_type("VectorVectorActivation_hls") + vvau_layers = model.get_nodes_by_op_type("VVAU_hls") folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8] for vvau, pe in zip(vvau_layers, folding): vvau_inst = getCustomOp(vvau) diff --git a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 3e8f30422b..c5d0281203 100644 --- a/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -187,7 +187,7 @@ def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_m assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation_hls")[0] + node = new_model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py index 18fce769fc..61f8af7806 100644 --- a/tests/fpgadataflow/test_convert_to_hw_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -189,7 +189,7 @@ def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation_hls")[0] + node = new_model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 3ad0bc4324..6c0712b7b0 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -271,7 +271,8 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) comp_nodes = model.get_nodes_by_op_type("MVAU_hls") comp_nodes += model.get_nodes_by_op_type("MVAU_rtl") - comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation_hls") + comp_nodes += model.get_nodes_by_op_type("VVAU_hls") + comp_nodes += model.get_nodes_by_op_type("VVAU_rtl") for comp_node in comp_nodes: if depthwise: getCustomOp(comp_node).set_nodeattr("PE", 4) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 98df27e3dd..236176faa6 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -294,7 +294,7 @@ def test_fpgadataflow_vvau( assert (y_produced == y_expected).all(), "VVAU specialized-op mismatches with golden output!" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("VectorVectorActivation_hls")[0] + node = model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) From d8b251c3fa62f05f6e82c6a9612b9afbb21ec9b0 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Wed, 20 Mar 2024 11:53:21 +0000 Subject: [PATCH 264/291] [transform]: fix to default to HLS VVAU if bit-width < 4 --- src/finn/transformation/fpgadataflow/specialize_layers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 628de08a3e..4e7c64bc02 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -64,7 +64,13 @@ def _determine_impl_style(node, fpgapart): else: return "hls" elif optype == "VVAU": - if _vvu_rtl_possible(node, fpgapart): + inp_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 + ) + weight_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 + ) + if inp_width_fit and weight_width_fit and _vvu_rtl_possible(node, fpgapart): return "rtl" else: return "hls" From ff31d9f3f45ccc4fbafbd0d3654ee943cf73e585 Mon Sep 17 00:00:00 2001 From: auphelia Date: Wed, 20 Mar 2024 15:45:52 +0000 Subject: [PATCH 265/291] [Tests] Infer RTL VVAUs in end2end mobilenet test --- tests/end2end/test_end2end_mobilenet_v1.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 86b698278e..cbf89c2eae 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -77,7 +77,6 @@ from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import alveo_default_platform, alveo_part_map, get_finn_root -from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import NormalizePreProc from finn.util.test import ( crop_center, @@ -224,6 +223,7 @@ def test_end2end_mobilenet_convert_to_hw_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx") model = model.transform(to_hw.InferPool()) model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) model = model.transform(to_hw.InferVectorVectorActivation()) model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) model = model.transform(to_hw.InferChannelwiseLinearLayer()) @@ -237,10 +237,6 @@ def test_end2end_mobilenet_convert_to_hw_layers(): @pytest.mark.end2end def test_end2end_mobilenet_specialize_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx") - for node in model.graph.node: - if is_fpgadataflow_node(node): - inst = getCustomOp(node) - inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) @@ -253,9 +249,10 @@ def test_end2end_mobilenet_folding(): # optional extra folding to use fewer resources # applied while setting the attributes on each node assert extra_fold in [1, 2, 4] - # set up folding for the depthwise conv layers impl'd by VVAUs + # set up folding for the conv layers impl'd by MVAUs # each value is PE for a layer fc_layers = model.get_nodes_by_op_type("MVAU_hls") + fc_layers += model.get_nodes_by_op_type("MVAU_rtl") # each tuple is (PE, SIMD, ram_style) for a layer folding = [ (32, 3, "block"), @@ -285,6 +282,7 @@ def test_end2end_mobilenet_folding(): # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer vvau_layers = model.get_nodes_by_op_type("VVAU_hls") + vvau_layers += model.get_nodes_by_op_type("VVAU_rtl") folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8] for vvau, pe in zip(vvau_layers, folding): vvau_inst = getCustomOp(vvau) From e0cfeee9853bdb03fd95491e2ccf09c6b8325303 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 20 Mar 2024 20:48:28 +0000 Subject: [PATCH 266/291] [Thresholding rtl] Update template wrapper file names to match top module name. --- .../custom_op/fpgadataflow/rtl/thresholding_rtl.py | 14 ++++++++------ .../transformation/fpgadataflow/make_zynq_proj.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 84fcc01439..2db52dad50 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -291,7 +291,8 @@ def dump_rtl_data(self, dest_dir, filename, data): # when generating template files, handle a special case: # if the filename contains the word "template", replace that # with the node name to distinguish between instances - filename = filename.replace("template", self.onnx_node.name) + if "template" in filename: + filename = self.get_nodeattr("gen_top_module") + ".v" with open(os.path.join(dest_dir, filename), "w") as f: f.write(data) return @@ -304,6 +305,10 @@ def generate_hdl(self, model, fpgapart, clk): # Retrieve the destination directory for the final RTL files code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Set the 'gen_top_module' attribute for use later + # by PyVerilator and IPI generation + self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) + weights = model.get_initializer(self.onnx_node.input[1]) weights_fname = f"{code_gen_dir}/memblock.dat" self.make_weight_file(weights, "decoupled", weights_fname) @@ -317,10 +322,6 @@ def generate_hdl(self, model, fpgapart, clk): file_only_path = rtl_file_path.split("/")[-1] self.dump_rtl_data(code_gen_dir, file_only_path, data) - # Before we return - set the 'gen_top_module' attribute for use later - # by PyVerilator and IPI generation - self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) - # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain # i.e. during the HLSSynthIP() transformation @@ -459,7 +460,8 @@ def code_generation_ipi(self): """Constructs and returns the TCL commands for node instantiation as an RTL block.""" rtl_file_list = [ - x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list() + x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module")) + for x in self.get_rtl_file_list() ] code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index ade38ddfbf..fc2047b08e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -64,7 +64,7 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if node.op_type.startswith("MVAU") or node.op_type.startswith("Thresholding"): + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] From 1e71186ef06e5f9a4c50912f1d44f9de3e8e3b8f Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 20 Mar 2024 22:53:37 +0000 Subject: [PATCH 267/291] Update comment --- .../custom_op/fpgadataflow/rtl/thresholding_rtl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index eaef2f30f2..aabce81a03 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -188,11 +188,11 @@ def prepare_codegen_rtl_values(self, model): o_bitwidth = DataType[output_data_type].bitwidth() num_channels = self.get_nodeattr("NumChannels") # number of channels - # If a single threshold value is found, broadcast it to all channels + # If a single threshold value is found, broadcast the value n_thres_steps = self.get_nodeattr("numSteps") expected_shape = (num_channels, n_thres_steps) - if t_packed.shape != expected_shape: - t_packed = np.broadcast_to(t_packed, expected_shape) + if t_packed.shape == (1, 1): + t_packed = np.broadcast_to(t_packed, expected_shape) channel_fold = int(num_channels / pe) @@ -513,11 +513,11 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): ch = self.get_nodeattr("NumChannels") n_thres_steps = self.get_nodeattr("numSteps") - # If a single threshold value is found, broadcast it to all channels + # If a single threshold value is found, broadcast the value n_thres_steps = self.get_nodeattr("numSteps") expected_shape = (ch, n_thres_steps) - if weights.shape != expected_shape: - weights = np.broadcast_to(weights, expected_shape) + if weights.shape == (1, 1): + weights = np.broadcast_to(weights, expected_shape) width_padded = roundup_to_integer_multiple(weights.shape[1], 4) weight_padded = np.zeros((weights.shape[0], width_padded)) From 755dacb90b2eeeec2c3314cf7c608bcb3f46c0b6 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Mar 2024 12:29:05 +0000 Subject: [PATCH 268/291] [transform]: unsigned weights currently not supported --- .../transformation/fpgadataflow/specialize_layers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index cabbd26a65..04f37cde0d 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -216,7 +216,8 @@ def _swg_hls_possible(node): def _mvu_rtl_possible(n): # Checks whether RTL-based MVU is supported # Currently, for DSP48 we only support computations up to - # 8sx8s and for DSP58 we support up to 8sx9s. Next to that, + # 8sx8u (8-bit signed weights x 8-bit (un)signed activations) + # and for DSP58 we support up to 8sx9s. Next to that, # embedded thresholding functionality is not supported and # neither binaryxnormode computation inp_width_in_range = ( @@ -226,10 +227,17 @@ def _mvu_rtl_possible(n): and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 ) weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 not_binaryxnor_mode = getCustomOp(n).get_nodeattr("binaryXnorMode") == 0 - return inp_width_in_range and weight_width_in_range and no_activation and not_binaryxnor_mode + return ( + inp_width_in_range + and weight_width_in_range + and signed_weights + and no_activation + and not_binaryxnor_mode + ) class SpecializeLayers(Transformation): From 7f29a42df00ce72133d4f16b381baa578ba9d96b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Thu, 21 Mar 2024 14:19:55 +0000 Subject: [PATCH 269/291] [transform]: RTL-VVU exclude unsigned weights --- .../fpgadataflow/specialize_layers.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index e987d21c66..917481edba 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -242,7 +242,7 @@ def _mvu_rtl_possible(n): # 8sx8u (8-bit signed weights x 8-bit (un)signed activations) # and for DSP58 we support up to 8sx9s. Next to that, # embedded thresholding functionality is not supported and - # neither binaryxnormode computation + # neither binaryxnormode computation. inp_width_in_range = ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 ) or ( @@ -265,8 +265,9 @@ def _mvu_rtl_possible(n): def _vvu_rtl_possible(n, fpgapart): # Checks whether RTL-based VVU is supported - # Currently, we only support RTL-VVU on DSP58 up to 8sx9s inputs. - # Next to that, embedded thresholding functionality is not supported + # Currently, we only support RTL-VVU on DSP58 up to 8sx9s inputs + # (8-bit signed weights x (9-bit signed OR 8-bit (un)signed) activations). + # Next to that, embedded thresholding functionality is not supported. in_width_in_range = ( DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 ) or ( @@ -274,10 +275,17 @@ def _vvu_rtl_possible(n, fpgapart): and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 ) weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 is_versal_family = is_versal(fpgapart) no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 - return in_width_in_range and weight_width_in_range and is_versal_family and no_activation + return ( + in_width_in_range + and weight_width_in_range + and signed_weights + and is_versal_family + and no_activation + ) class SpecializeLayers(Transformation): From a4a2ae429f5ef017231322ecdc2eced44b639861 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 21 Mar 2024 14:19:55 +0000 Subject: [PATCH 270/291] [Tests] Update bnn pynq to use rtl components - thresh and swg --- .../fpgadataflow/specialize_layers.py | 4 ---- tests/end2end/test_end2end_bnn_pynq.py | 19 +++++++------------ 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py index 04f37cde0d..c4768f2399 100644 --- a/src/finn/transformation/fpgadataflow/specialize_layers.py +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -53,10 +53,6 @@ def _determine_impl_style(node): if impl_style == "": if optype == "StreamingDataWidthConverter": return _dwc_determine_impl_style(node) - # TODO extensively test RTL thresholding - # for now use HLS component for thresholding - if optype == "Thresholding": - return "hls" if rtl_variant: if optype == "MVAU": inp_width_fit = ( diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index e90c412dae..fac50fc48b 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -95,7 +95,6 @@ MoveScalarLinearPastInvariants, ) from finn.util.basic import get_finn_root, make_build_dir, test_board_map -from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -132,7 +131,7 @@ def fold_tfc(model): fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) inp_qnt.set_nodeattr("mem_mode", "internal_decoupled") @@ -157,7 +156,7 @@ def fold_lfc(model): fcl_inst.set_nodeattr("runtime_writeable_weights", 1) fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_hls")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) return model @@ -600,36 +599,32 @@ def test_specialize_layers(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) # set preferred impl style to hls for all layers - for node in model.graph.node: - if is_fpgadataflow_node(node): - inst = getCustomOp(node) - inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) exp_layer_counts = { "tfc": [ ("Reshape", 1), - ("Thresholding_hls", 1), + ("Thresholding_rtl", 1), ("MVAU_hls", 4), ("LabelSelect_hls", 1), ], "tfc-1-1": [ ("Reshape", 1), - ("Thresholding_hls", 4), + ("Thresholding_rtl", 4), ("MVAU_hls", 4), ("LabelSelect_hls", 1), ], "lfc": [ ("Reshape", 1), - ("Thresholding_hls", 1), + ("Thresholding_rtl", 1), ("MVAU_hls", 4), ("LabelSelect_hls", 1), ], "cnv": [ ("Transpose", 1), - ("Thresholding_hls", 1), - ("ConvolutionInputGenerator_hls", 6), + ("Thresholding_rtl", 1), + ("ConvolutionInputGenerator_rtl", 6), ("MVAU_hls", 9), ("StreamingMaxPool_hls", 2), ("LabelSelect_hls", 1), From 942735db544064e5039f324042d26cf3e205a4d0 Mon Sep 17 00:00:00 2001 From: auphelia Date: Thu, 21 Mar 2024 16:21:29 +0000 Subject: [PATCH 271/291] [Tests] Remove mem mode setting for RTL Thresholding --- tests/end2end/test_end2end_bnn_pynq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index fac50fc48b..a25d7e6725 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -134,7 +134,6 @@ def fold_tfc(model): inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) - inp_qnt.set_nodeattr("mem_mode", "internal_decoupled") inp_qnt.set_nodeattr("runtime_writeable_weights", 1) return model From 2fc9590a7c1a378f112c1234041bc34b4a4456f5 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 22 Mar 2024 09:29:50 +0000 Subject: [PATCH 272/291] [Thresholding] Use new wrapper name in prepare rtlsim --- src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 9193db750b..3cbb2ba427 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -346,7 +346,8 @@ def prepare_rtlsim(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") verilog_paths = [code_gen_dir] verilog_files = [ - x.replace("template", self.onnx_node.name) for x in self.get_rtl_file_list() + x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module")) + for x in self.get_rtl_file_list() ] dat_files = self.get_all_meminit_filenames(abspath=True) single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") From 5df52b9c1f740f32a899417ede7042b4e0da3f28 Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 22 Mar 2024 10:34:16 +0000 Subject: [PATCH 273/291] Fix linting --- src/finn/custom_op/fpgadataflow/rtl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/__init__.py b/src/finn/custom_op/fpgadataflow/rtl/__init__.py index 3bcad9e8dd..06067a4fca 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/__init__.py +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -35,8 +35,8 @@ StreamingDataWidthConverter_rtl, ) from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl -from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl +from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl custom_op = dict() From e1c326d38c824895d23171787c73c49977eab93f Mon Sep 17 00:00:00 2001 From: auphelia Date: Fri, 22 Mar 2024 17:10:54 +0000 Subject: [PATCH 274/291] [Docs] First sweep to update the documentation --- docs/finn/brevitas_export.rst | 10 ++-- docs/finn/command_line.rst | 48 ++++++++++-------- docs/finn/end_to_end_flow.rst | 6 ++- docs/finn/getting_started.rst | 38 +++++--------- docs/finn/hw_build.rst | 14 ++--- docs/finn/img/finn-hw-build.png | Bin 57109 -> 59034 bytes docs/finn/img/finn-stack.png | Bin 66753 -> 82992 bytes docs/finn/img/nw-prep.png | Bin 31538 -> 54279 bytes docs/finn/index.rst | 12 ++--- docs/finn/nw_prep.rst | 17 +++++-- .../finn.custom_op.fpgadataflow.rtl.rst | 16 ++++++ docs/finn/source_code/finn.transformation.rst | 4 +- docs/finn/tutorials.rst | 11 +++- docs/finn/verification.rst | 8 +-- 14 files changed, 109 insertions(+), 75 deletions(-) diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst index 950b601f98..0a1c788324 100644 --- a/docs/finn/brevitas_export.rst +++ b/docs/finn/brevitas_export.rst @@ -8,11 +8,11 @@ Brevitas Export :scale: 70% :align: center -FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. Brevitas provides an export of a quantized network in ONNX representation in several flavors. -Two of the Brevitas-exported ONNX variants can be ingested by FINN: - - * FINN-ONNX: Quantized weights exported as tensors with additional attributes to mark low-precision datatypes. Quantized activations exported as MultiThreshold nodes. - * QONNX: All quantization is represented using Quant, BinaryQuant or Trunc nodes. QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn` +FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. +Brevitas provides an export of a quantized network in QONNX representation, which is the format that can be ingested by FINN. +In a QONNX graph, all quantization is represented using Quant, BinaryQuant or Trunc nodes. +QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn`. FINN-ONNX is the intermediate representation (IR) FINN uses internally. +In this IR, quantized weights are indicated through tensors with additional attributes to mark low-precision datatypes and quantized activations are expressed as MultiThreshold nodes. To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN. diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst index 8c37479a28..110a522847 100644 --- a/docs/finn/command_line.rst +++ b/docs/finn/command_line.rst @@ -20,7 +20,7 @@ two command line entry points for productivity and ease-of-use: Jupyter notebook as a starting point, visualizing the model at intermediate steps and adding calls to new transformations as needed. Once you have a working flow, you can implement a command line entry for this - by using the "advanced mode" described here. + by using the "advanced mode". Simple dataflow build mode @@ -28,7 +28,7 @@ Simple dataflow build mode This mode is intended for simpler networks whose topologies resemble the FINN end-to-end examples. -It runs a fixed build flow spanning tidy-up, streamlining, HLS conversion +It runs a fixed build flow spanning tidy-up, streamlining, HW conversion and hardware synthesis. It can be configured to produce different outputs, including stitched IP for integration in Vivado IPI as well as bitfiles. @@ -43,7 +43,9 @@ To use it, first create a folder with the necessary configuration and model file 3. Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``. Read more about the build configuration options on :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json`` -4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. +4. (Optional) create a JSON file with the specialize layers configuration. It must be named ``dataflow_build_dir/specialize_layers_config.json`` + You can find an example .json file under ``src/finn/qnn-data/build_dataflow/specialize_layers_config.json``. +5. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``. Instead of specifying the folding configuration, you can use the `target_fps` option in the build configuration to control the degree of parallelization for your network. @@ -59,25 +61,28 @@ as it goes through numerous steps: .. code-block:: none - Building dataflow accelerator from /home/maltanar/sandbox/build_dataflow/model.onnx + Building dataflow accelerator from build_dataflow/model.onnx Outputs will be generated at output_tfc_w1a1_Pynq-Z1 Build log is at output_tfc_w1a1_Pynq-Z1/build_dataflow.log - Running step: step_tidy_up [1/16] - Running step: step_streamline [2/16] - Running step: step_convert_to_hls [3/16] - Running step: step_create_dataflow_partition [4/16] - Running step: step_target_fps_parallelization [5/16] - Running step: step_apply_folding_config [6/16] - Running step: step_generate_estimate_reports [7/16] - Running step: step_hls_codegen [8/16] - Running step: step_hls_ipgen [9/16] - Running step: step_set_fifo_depths [10/16] - Running step: step_create_stitched_ip [11/16] - Running step: step_measure_rtlsim_performance [12/16] - Running step: step_make_pynq_driver [13/16] - Running step: step_out_of_context_synthesis [14/16] - Running step: step_synthesize_bitfile [15/16] - Running step: step_deployment_package [16/16] + Running step: step_qonnx_to_finn [1/19] + Running step: step_tidy_up [2/19] + Running step: step_streamline [3/19] + Running step: step_convert_to_hw [4/19] + Running step: step_create_dataflow_partition [5/19] + Running step: step_specialize_layers [6/19] + Running step: step_target_fps_parallelization [7/19] + Running step: step_apply_folding_config [8/19] + Running step: step_minimize_bit_width [9/19] + Running step: step_generate_estimate_reports [10/19] + Running step: step_hw_codegen [11/19] + Running step: step_hw_ipgen [12/19] + Running step: step_set_fifo_depths [13/19] + Running step: step_create_stitched_ip [14/19] + Running step: step_measure_rtlsim_performance [15/19] + Running step: step_out_of_context_synthesis [16/19] + Running step: step_synthesize_bitfile [17/19] + Running step: step_make_pynq_driver [18/19] + Running step: step_deployment_package [19/19] You can read a brief description of what each step does on @@ -99,6 +104,7 @@ The following outputs will be generated regardless of which particular outputs a * ``build_dataflow.log`` is the build logfile that will contain any warnings/errors * ``time_per_step.json`` will report the time (in seconds) each build step took * ``final_hw_config.json`` will contain the final (after parallelization, FIFO sizing etc) hardware configuration for the build +* ``template_specialize_layers_config.json`` is an example json file that can be used to set the specialize layers config * ``intermediate_models/`` will contain the ONNX file(s) produced after each build step @@ -206,3 +212,5 @@ You can launch the desired custom build flow using: This will mount the specified folder into the FINN Docker container and launch the build flow. If ```` is not specified it will default to ``build`` and thus execute ``build.py``. If it is specified, it will be ``.py``. + +If you would like to learn more about advance builder settings, please have a look at `our tutorial about this topic `_. diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst index 0a022067c3..8fafde5a5e 100644 --- a/docs/finn/end_to_end_flow.rst +++ b/docs/finn/end_to_end_flow.rst @@ -2,7 +2,11 @@ End-to-End Flow *************** -The following image shows an example end-to-end flow in FINN, starting from a trained PyTorch/Brevitas network and going all the way to a running FPGA accelerator. +The following image shows an example end-to-end flow in FINN for a PYNQ board. +Please note that you can build an IP block for your neural network **for every Xilinx-AMD FPGA**, but we only provide automatic system integration for a limited number of boards. +However, you can use Vivado to integrate an IP block generated by FINN into your own design. + +The example flow in this image starts from a trained PyTorch/Brevitas network and goes all the way to a running FPGA accelerator. As you can see in the picture, FINN has a high modularity and has the property that the flow can be stopped at any point and the intermediate result can be used for further processing or other purposes. This enables a wide range of users to benefit from FINN, even if they do not use the whole flow. .. image:: ../../notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 6bb0f3ab1a..eae61b1a55 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -8,7 +8,7 @@ Quickstart ========== 1. Install Docker to run `without root `_ -2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.1``) +2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.2``) 3. Clone the FINN compiler from the repo: ``git clone https://github.com/Xilinx/finn/`` and go into the directory where it is cloned 4. Execute ``./run-docker.sh quicktest`` to verify your installation. 5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup. @@ -28,8 +28,8 @@ to train *customized* networks and create highly-efficient FPGA implementations In general, the approach for using the FINN framework is as follows: 1. Train your own quantized neural network (QNN) in `Brevitas `_. We have some `guidelines `_ on quantization-aware training (QAT). -2. Export to FINN-ONNX by following `this tutorial `_ . -3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ +2. Export to QONNX and convert to FINN-ONNX by following `this tutorial `_ . +3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ or for advanced settings have a look at this `tutorial `_ . 4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results. Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide. @@ -49,13 +49,12 @@ Running FINN in Docker ====================== FINN runs inside a Docker container, it comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources `_ to get started. You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well. -If you want to use prebuilt images, read :ref:`Using a prebuilt image`. The above mentioned script to build and launch the FINN docker container is called `run-docker.sh `_ . It can be launched in the following modes: Launch interactive shell ************************ -Simply running sh run-docker.sh without any additional arguments will create a Docker container with all dependencies and give you a terminal with you can use for development for experimentation: +Simply running bash run-docker.sh without any additional arguments will create a Docker container with all dependencies and give you a terminal with you can use for development for experimentation: :: @@ -93,11 +92,12 @@ This will launch the `Jupyter notebook `_ server inside a Environment variables ********************** -Prior to running the `run-docker.sh` script, there are several environment variables you can set to configure certain aspects of FINN. -These are summarized below: +Prior to running the ``run-docker.sh`` script, there are several environment variables you can set to configure certain aspects of FINN. +For a complete list, please have a look in the `run-docker.sh `_ file. +The most relevant are summarized below: * (required) ``FINN_XILINX_PATH`` points to your Xilinx tools installation on the host (e.g. ``/opt/Xilinx``) -* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.1``) +* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.2``) * (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA). * (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``). * (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time @@ -108,10 +108,8 @@ These are summarized below: * (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker * (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite * (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests. -* (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1 then skip Docker image building and use the image tagged with ``FINN_DOCKER_TAG``. * (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use. * (optional) ``FINN_DOCKER_RUN_AS_ROOT`` (default 0) if set to 1 then run Docker container as root, default is the current user. -* (optional) ``FINN_DOCKER_GPU`` (autodetected) if not 0 then expose all Nvidia GPUs or those selected by ``NVIDIA_VISIBLE_DEVICES`` to Docker container for accelerated DNN training. Requires `Nvidia Container Toolkit `_ * (optional) ``FINN_DOCKER_EXTRA`` (default "") pass extra arguments to the ``docker run`` command when executing ``./run-docker.sh`` * (optional) ``FINN_SKIP_DEP_REPOS`` (default "0") skips the download of FINN dependency repos (uses the ones already downloaded under deps/. * (optional) ``NVIDIA_VISIBLE_DEVICES`` (default "") specifies specific Nvidia GPUs to use in Docker container. Possible values are a comma-separated list of GPU UUID(s) or index(es) e.g. ``0,1,2``, ``all``, ``none``, or void/empty/unset. @@ -125,23 +123,11 @@ General FINN Docker tips * If you want a new terminal on an already-running container, you can do this with ``docker exec -it bash``. * The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the finn compiler folder (which is mounted from the host computer) or otherwise backed up. -Using a prebuilt image -********************** - -By default the ``run-docker.sh`` script tries to re-build the Docker image with each run. After the first run this should go quite fast thanks to Docker caching. -If you are having trouble building the Docker image or need offline access, you can use prebuilt images by following these steps: - -1. Pull a prebuilt Docker image with ``docker pull maltanar/finn:`` where ```` can be ``dev_latest`` or ``main_latest`` -2. Set the ``FINN_DOCKER_TAG`` to the name of the image you just pulled e.g. ``FINN_DOCKER_TAG=maltanar/finn:dev_latest`` -3. Set ``FINN_DOCKER_PREBUILT=1`` -4. You can now launch the Docker image in all modes without re-building or any internet access. - - Supported FPGA Hardware ======================= -**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. +**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-o> -**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. +**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Kria SOM, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. PYNQ board first-time setup **************************** @@ -177,7 +163,7 @@ On the target side: On the host side: -1. Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation. +1. Install Vitis 2022.2 and set up the ``VITIS_PATH`` environment variable to point to your installation. 2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed. 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)* 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above. @@ -201,7 +187,7 @@ System Requirements * Ubuntu 18.04 with ``bash`` installed * Docker `without root `_ -* A working Vitis/Vivado 2022.1 installation +* A working Vitis/Vivado 2022.2 installation * ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_ * *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts. * *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst index a5c486935d..9e34edc9d1 100644 --- a/docs/finn/hw_build.rst +++ b/docs/finn/hw_build.rst @@ -8,7 +8,7 @@ Hardware Build and Deployment :scale: 70% :align: center -A model where all layers have been converted to HLS layers can be processed by +A model where all layers have been converted to either HLS or RTL layers can be processed by FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI) design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. @@ -69,9 +69,11 @@ FINN will descend into each partition and insert FIFO nodes between streaming no where FIFO depths dictated by the node attributes, using the :py:mod:`finn.transformation.fpgadataflow.insert_fifo.InsertFIFO` transformation. Afterwards, IP blocks will be created for each partition, which in turn contain the -IP blocks for each layer stitched together. The layer-level IP blocks -are generated by Vivado HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` +IP blocks for HLS layers and RTL modules for RTL layers stitched together. The layer-level IP blocks for HLS layers +are generated by Vitis HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` and :py:mod:`finn.transformation.fpgadataflow.hlssynth_ip.HLSSynthIP` transformations. +For RTL layers calling :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` will fill out the RTL wrapper files and store all files belonging to the RTL module in a folder. + The top-level IP blocks are generated in Vivado IPI, using the :py:mod:`finn.transformation.fpgadataflow.create_stitched_ip.CreateStitchedIP` transformation. Vivado/Vitis Project Generation and Synthesis @@ -86,7 +88,7 @@ Deployment ========== -Deployment and Remote Execution -------------------------------- +Deployment +----------- -The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks. +The bitfile and the driver file(s) can be copied to the PYNQ board and be executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks. diff --git a/docs/finn/img/finn-hw-build.png b/docs/finn/img/finn-hw-build.png index f3a591fa8f9e25f99b44d2bd9502bf3ae979818b..412317b8d116877ec3993be1f987f639920ef366 100644 GIT binary patch literal 59034 zcmcG$byU<{)HaNQC`g9_5)#rez|bW~2@GA*-Q6Q8Akv8B3|&JC0uquE(mkYL&aA8bMLr$!#;kN6T6(~bFVz8XOJTk$$uS@SCr)mdd7Y#jT2$$<7Zd9Kea z4mcQ4(uNX9J^6q1Cp0G6>fwKLfA$N7z5Z_@1{yfye~TWXLx}(T#l;6i#DB8fyhcj@ z_uGJMTe|;5lo$aM{yQw3#s>c1_nr7)$ba82KZY|B*=u@^U!<}#POUJ{Eqe?=Sm zSrGW#7aVmwcTOUsmKeC?|Mm$j!_cuh14FXSmi%~;+8c&fuhf9=!u|$y`tlt7^L;EL zqLX?IoMQ?|-HCZ1*Ti(ZkOHiPcme zEnOw%{U-n2p93(w?Emp_T3$%0_SY}h@0#D&z+OG$GGZ4;-MI74a6QJS1}6kRJPUC{ z9d^@LX;;Gd9pObf6`3~UMe#j@gQtDh$HNk9ceM^P%Sm|8*mYkXE2j(6N}TV{>)#yp zb9JdNP>TEMo@|c3X*wPfSsoNvB-Rv!m7CT>tB{t_5sFwl9NToB!=jMJv2l#%iT2=4kHE6$Z8e zjRSJ(Hk%A=^V#S2bm~{#Vt~EXyJ#8-qm#YaDJ3r$w|SU61d@b0_+6v8?|Xf=*QFl+ zxG&<|V&VD=fs{NYgg8P>$_m~U^MdIGb=|yInkC8A=?)XCcG+gqdR(P>cckw@^OO=E zwutwB;@VJOA5V4UqXE^A*rd`i6ny!%JTN}{*Zz3^*(Wp z-2_i^-(y@P>SjY?dCq;9l&kfosz{?S_M#PaE#Z2#RTwo>ZSBj^=&~f`dbyU;kIM2p zdaL$TkMEsqfyjRS8_g2Ux8<#lv;38AYYG)NyhDO-(%pxIjBG``b_ucqu8QkdzbH1H zX%T>&1Y5p;2lKMowF$l8cSwa9Sga4g=!avtcnu9qI=?~uY?q==PCQ_!o8#dq=a$Q^ zfm)ln`bwY8oKFK5DPDWOBiPF8rS9;}qa7c&t6~Cn`kyt*_CPddsw{tbFZypJ>1ZND zoe!MrmoRySm@$5XEv>El_ef80eP2Q}()bEytC!7w ziNtVf*kUAyg0PZpDR?ZxceTU!T2CkSs~Txk=g|jy`c))u85kjO22^*AROo61Q|9L> z_s(#_eK$~tF3l9hxg_AK7FVg<=Bs?~FP1+2V#@McHfn4S-v1&fQDZ-qe4cF!PiB8d zcfdwA1Jr5?C=jSkLGIn8_bS8Ukp6(dc)Vau@r^wnbmT>;CrM?*d`;;w~^_q zEP8u_T<)d`^f(|5x*5t4=@*^L+GLD8-s=bFlu~7iNmEuXuJxz%dr%TLB$pgj=+_My zc7q}H{DPnAQ&bl!TCV!|AXew1lsi?H{i{52;#WnxtdYaQ^m4I@=OVHAQ&_{`me42s z_wnwFigNnK$0r#hcb|xA*2H;_^lYFIs;|iL*ZtSiJ*9b`;vEe?wXA=k_* z7A>6DMruD((A!73Z*U82rOZijou!2}1;mhG&+YNIn&k%X_;*L203P1>OH0S_)r)9^ zErrlY`oKms^OxAbay!S8aBGrd{@P3;0M6r%rh~DGNp$ z2ofHU0bPgXI?mP}8MH_L@d4F7-9S4J5}(Axb)AzFbc$YU_iR{)+vltObki!;P8t;m zDk4Ao$QiNO5We&AcKNEq#x$_rnZTB1_d+OZg50Gvkm1%I~lP zU!uo)b4-ydsT&bABXqr6T_D*8y|`XPan`-lMS}eJehk=D;pR7wh_`(Joo z|M#YCsI3BO^2mwr=vzkc(YL^l5=ac*!i`H_2#7QC%5{NpkCEd5us=)=d|8d*Mx*|a zQE7?+i-x?}?@e4sV~H-AB*x@W0eh(N3#fn{w0Wgz@MurZ$e12~LuYic2YD{QH{_1b zwzbaJ(Q~ZM3?Zh_xWFV|d9S&mF95mdy7`V&K8~cO5^|H~#~dYi^!{8FoH_dKg|J7_ zO>_?)MZ1m5Lo$wC`-LWGm6`!UiM%2LQ}r3xLCfPIu83d%S5*WSA)a)sQ|qmlEf4m? z7$LGpkD`BOcrR@6JMi-Gte$C_k4VE916j*#ABK14JSdsG5671IN&*kVfBt;QYyuyt z@2+HCx=CTxQs5tmqZZrdEPWDYE0tfSk|l0*Sw9KFKvTmJ`w|w$hr^xx<_* z@RGhyuId9BB2yXC9Vxc&&*#18EeRh;iSN5V>dIJxn~H$uF9Ns6i=D)WI3^D*;b@tX z(WV-Igad94yMGf&1>`(f!Sq04tXvKU-f4(8Vw1TBBCr_MbLg5t@bI5oQ&eS!joJb4 zr0$ckoJV2SO6eN~7ZLGt@^7U*f=FIK|qw z*CG2h4g60hs<@xfhnB0@d|Q6u5QGV$(`K~kDc*kvf}|!X=O__VL|^Q%3FrSD@Ycnw z4&BN&eT;knxamVe+p(D@XmvU%KH@PW(YlJv6$RMlWp^5&UlUmZh${D(U9?Ju#2a@Y4TmjxGRbm_@&xCSqzOHZmUc_3Fx zo)-@vPRTFwV$lUhaSk#O^!@OD(T@H z7y^$DFL?WE_gMlKz7Go#(Nn)wo2m@^5Y>D8GcCkqn~vbb#rb}jhPQ;rN$rr z1Q!>P0)}fYNWdYLkE!;^_)nTYyFI_usIK%BNL3d}63z`FJAUA95?n-H8&U~D+fdH> zHK21YDoy4R!^6wlJVpvcz0Liw9q3-oRNqdVJW)dm3lgJ5r-HM)cS+DxVY2KUJ#y4W zE_9&j)5}ysOZdQXNV(Nunn21q{*js&^2z1H4Wj;?c#fc+=Ssu_G)IK*c>42(bHmN$ zrqolPu*dCZfCyTYh#UG@Os)uX`08*?OMFD1ZYmV!MMpqeSphTb2)5*oRd_saU1^-1 z#u$zD#``r@s#hQa$Q7LStFD@}Tjbm7IjIbM|9=%KW)l)TLt+3IO)8TmT)Rw{Hib8j zZ__GF^5iENvdLN4VkxI17*zDI%iUxyDiSY;Fx>g>wC!-yF3icGBu!fq&>5h7nLMf( z594fmS~^k*)J?UHFvTyMix5>fV=ApBnJ36nur`D{QUczCKW z`GFrIc(55zAV=deS1*2bAZ}WP4&Eg!XuBYPH1tCV3$sgN%KFBzg9mX-L|qYXZwo9ws1Y%7n9S z&2v^lq+Jg!A1YDx6{@7?3`qh;7yD=hHef^F&~bO(ib_K5Ag~e)|74D|F|hsOhHM+D z5+|W~T!&!g1Mj=fe}hu!Xe%)&9+>h%&{pg}m#biW;1=Ap``9HGwDo~pZAE_sf{W`9 zwoCf(+8lS$r_{Xd&Lp2;NzJfqLsNmWaKyt-sS`!G0+3FiqT?sbV2ors)zP0L7F1Kq9|;u zW180ERmhK1OSlV{$q|O<-bgbsuDi4b+Wv!qbsYl13Xl^spp*@)YgOP-afw#^#yhb< zc&S9x394vh_R_o)f-j@n+PO_ITN7Z6bA=#Os;O#>PzSW#J;Xll_)}#-y+nMR4NDiD zY1&6_2$~`FVWaW(ySCdPYdJ_)P8wr+;2c|O5XMXFD0@vxB*w@DzM4_=O#7+iyU!~3 zA#&*fmyDDd7WiSi9UL&pd2mi;WVwKiM;qnR3n#ILkT7D*d2`>AMgq);m1+w24>nIG zk&jc*l`LZqWyo~#a^lMxia!9N1Zw1iv{aQmxdE%qzyc!uoF8Zmlt8 z9N655*VXW{0kL8JSDc~}ZNBJa<wu6X{`}lk}3Txj5eC^{GUHJC9rY6 zi6a^%$jNjpmG!dG>0`~+<29*v(a&G)Q+Y(N;emp67gLJ6&rCBl!CwXcz)h~hi!yLh zsnP?HBhN@ZUTLip{Sh=#8r%zh4^0&(S5>i_68|!h6}(SHQ79ml;Ip}U3-@sXFHyPcaW#=%+GQKvE4F~lIcM#5P2M;HBL4pqYUv9BwwDX+>|SE2 zCjjoGL4Ci8d-?wdzBvL+GyVHjERO~+@uVaR>`*a%*fGJHF(bJ(z>I^5V}6vYcZS2M zkN1r|6`xGN7kSpbvZ?9ICbC)iA%)b{-NFW2_!nMcOjD5|o|y64z|_kvA?U1E;zd|m zCV3!Cp}Y3;ByHnb!O({I(3y6_+MqnyqB!rNQw_CCGK_tyKRj^YU2TW~KRq|wtLimq zkIJCU<-z#CMayM(`sd^?85h|txKaurFoa?9A7hm0%(rz@l1lO7{lYo6mKy&_tHlYf z&O?+icZwr~X`QY&d6T5#a{z1NVDz9nK{SYgwt{9yB2-uw~D4>``qYyRG7 zY!X&BmcY9r(WfXfPAgWd(72VAxl`6U-yM*>0&?Hy&lrYowc7PNKonG1XT|B4-fy6q z#cOHPtD5p^CFQaH0lmh+69AOPkHZXrZ_Z1%;rD<{)#|b(T$#=i-ZJcD(0kd$&DE^Z zc#_X6Y*`~Zdl@Kgq{o@uRt2{Wt`*E$gbXU&Jd00T9e7r^y75YR2lVxqL_%N%*c)x@R930zi@$}=28-HB@#t^ITPeX&1m3X^51$2^g%smTJTry0Y> zMWP;A8e6IBb8kiHjy3apxusd4YMHA87w_fW?@(gSw|PAN@7oPQF5A2acAeSKh&QWB z{`(;MJ8Wy%E%bUbdrad9f~25bRw(u;EJO7wJ_iM`Qgvr6M@0Wvqszw#xxXEywd4j@ zL>IHw0id$w<2l1Z%Bz%Rui@jevQVV}7DTSBqN-ysbSe8Q) zYq}@WCh5a(42a|7zs+6FVi7Z}zIe*#D%E0Zmjydf@t+-fk1l!*=@~^@W_G8NAy^JS zHaYGo4k`H1pv-R))AqNLH@H<{Si}?W!(W?H)o^Vb*d($Y1TMxicE2tG|A-mrOgXN7 z>ZnaDx#>1GyS+o8{ty6If#v0Q)hLmP)>5LjhH*h^mvXY(N`lClfjUie_eMcyaiQajpU^+}e2Z1%p z188eej7z4V*Ron~4we!dt_?}bWcZ>`2eE%2*}_iy3P;)jzYBV<-?j0DDvaq~4{9Xc zU?JIc(xl|{qzmS;h5`S^baN?0kq4<3@Z3c_nX1bW7CSVW3b*LO@8ma;I;m7^uzjff zJN3ph{nnbvfDJi8vPQkeicGGHrMOPa$U=CE3@GiEaj-!W%rR$6!Jh>Xc5(~CNN%<) z1Mxu2R=y#x{i&%4o<6~b_R)k`akue_QD(WVSRSgc{RuNZ2bQ@G=~dYG2F7fa;|zL=gWP^RN)I3Ii^HxbXDB}3GpSgVd{sK zbqfDN53fj;DLnnQsW`d2%B~8tyUG8nMN^o+a~#MQL57IN&~C0W54Rg9Kj z{Pr@*=BaUH--*xE95rB)V%ghV7aYzO8vtFD=9wq4>fEf*po9Ytzn(7&qH1Kh*l#z} z!lLv&FD)jfnu+*5Rj!wYsT0&Y|9HFdoo&qtKU*twyv+j-c);~bzE}07roe*$WmRXS*+fe?u%t? zmSgYnRgi)Os3*iLyDAa)M13pji2~E_RqM~ezf64{ddEfG4`2dlhO_E_^Qte2-#RGv zD)@=X#6WnIG#19!CoOP$$q&zqZw1$GRE_17absz62Dx^O5>}G+)r6;Ot zsT(>`tGZ8XHP!KCl~s;Lkm6m5N3nllhT2-(EKL*C7U@h;z6Y+_Yg1Faeuw)qa?G>s z*G9ycr9xEoozY-lUQw=%iI>iy=WwW4MU#&E=_?%r9D2{6P%aL)1(l)@Y8TC!ojn<# z8Y0arfikGZv6?=%42#4@V0YJ!$P5oWB_X>G^=IHvWOZIAB?a=!T1|oGCu@tb;@%YW zFIaVoo3^?{Dh$t-bPgP(LVh`Ps*d++R$SWW0-9PLj5Dr0V&1N>IT^cKZoJE)@<$Ar z?+mwY&Yi{zN&K0!??^wNB90B#X1XujFnG27x$#y=V&N!Q058N)Vp_21+pL57p z)l}oNu@Z3Yvn}jtwW5}Jr2c08l;q%5bIm!;^Q_iNhZ&yDzVqfh`)F^f`*M(cyySV~ zon>F*jQ0dYr`&0xVCI3JuRXu4M~wc0asIHvGh}s3L6KDPp0~4}Es54=#NdT;KF5}0 zvu|W>79(TS#DhPy>T>VR`q?)M zD#Tdw3DHxzCN_SLdPN7jz^5{5nj_!2FSRRkQHpf_$|a|qtR*eGgj}4BS7Tpep}#3N zU@;ZJmr@M(laBt#Q<~u+W&^ajGZ<%oKwVknQvDl}Is5kJZY4y!BP^GhHT8F@TUsjj zD^NA(-QVNq%Oz>$b86*&Kd4+Yv7A<=#qZ~#jf?^y6_LAfWa+L=>dE9nydt*54i%je zScydDqhh$QrA1B60_2z0_Ba0JtJ#Vf!xpI@^Fxi5M(GQopg{`@<^eu!b2Ck!-*hy^cUJD6RFfNt*;#w!> zxsa9ZJj&_vpw>tuQh*aJLEHt0SVk>Ad+R|o|CZg21oUd=KM*82ozU06oOvEq$(dyJEw_Hu%};!tFF9Q)NPOTC+4b}ufd zn0ZlLZts7%tzl+(brm-z2>ZJYIJ_j0Nqsf!P_GJX`OUGMK-GfUhq2-6hN$bT;z}Bc zAAg$O7=w*V?yV$({uQrj4*il`H;{Gd!FEVL-Z;Km_I|DTd%5W(;;O2}WztvDk)Dr! z$qHD>#s3m}4dwT3z1jB@7yBFo{#_#MFloXihdC1aQ!);;|GnHJy**5S&LJVxZu${v zKfcRCVPPtlE})nE&TJUMycmYH;~`}GqkIWQjY|CCWE!E09RCI=;#%BsQP4y}Ax&sc zv2;mFjf51B<8}>qhl&&jSARrt?)GBunpLyp+Yq$kZpv?z2$=EJ!$I&@f#Z=3WnN|f zftyw-G6-_ZvOk3i+9tEH*M%<(#|>(wgEL0v(#hoTKolUqo7^dlrIT1R)*;6QiuVpL z2E=FiL8YGfOY%dB;_vsjbNY9WA`R{S>Um3Ey*vGax?!az{esf(qWK^K9CUo(gYd|p~$Am-nz83aWF{!qrO6G3#By*26n-MU1J;E5PfNs zem(5&#;R2g5{zx#0p61N3^_1F%$^m4VSeJ53=gP+tW;W46LB$;_&{q%&X?lzoCiqWL#vsU)>B!OJuRFH|z0@}mh=|-G zc{M>LVz8g2v+H$8-Ku7pWa` z$R3+e2fO-OC=0*Dkpu~z*Jv@jOz*^lxDR5Bf6lc9{7;>P6NwS?PW-j#yX&P}Qr2YS zy{5on0pp%~LmH2Fb3EZM#b_^`v2 zwU}4GlTuI{=QA;uG^Zx12)uW1QT@i8IW2n1oM}lH4Sj5pwyYY6fmto((y9t%&!3x~ zykcEGNSek0!Q((Grp^chSD#yUSC>m0SC3<&OQ^GKWR2g;kFpn)hOJS9flG29%t$# zVvQ-}=?QI+->r|PjQU^pxqI~UxO(hV7;SZ*J0=h!(#vA$&i{(rz8u`6xpeox zB|{1!)*_LKv<=onP49-9r=N9=3IZ((5!&j?X7KO5FM*8oqF$*#GH{rW8-|5)P00Yl zhSldt3;35p0hE5^c4i9fb8d1%znvv>zHLL@7)uhK2jyXm7WA=3r479H=HfH*i8$+- zvc2_MPCsR?^5$LGaHFgMu}By=5p(6wQDv0~&6sH|a31VI*X%mJHTSwVbUMA6> z18pWOTupjGmF}INGX1@Mm7HjH)8^ke>}d2SEL;Z6h!M<+eA|3pQSRNMNruFrwF}#X zZDWV;f9m$CNI~W8P))uO5cj7dVLmguYXDN4za9>=-aI)#vZ z7rnpVj|O48@eO}fNa(xU7Xv4ErUTAUkx%@IVYsuiLb(yq274F3>LTX3SE(xPcMHun zdKia}Y~315<`iT&T7Tdp!H)Z?Z#X~ggzq#ud#O~`ToXgrY?;rLZx?(~#om$WFvGPL z(Q7}2-K;5{WNXyT12g@KjX6v2n;XQmn7En(HR^AdHdLXCt`{u%RXEVC%;3_kad}!R z4sdPomoLun+I)-7sUVYBbzWj)_W`y3#GAKdu^w+#grDx@TfJ)9gr^RK%F5fPO`ar@ z4397W;(=s-?*yLi18U|)!phFjnuOZf`Nduc((~f$XVqv^9?~cTmh6*kN#}K#RL7~$ zkQMn?p(aj6hlBITX}+pr2Wc8zu`3Ar-3fkd}9+ILh&ZXDnn5L-qtoh+jQ&y30N(-T1tDS)i z%AuR$QUa&DGQ-|->X(?t?;cT)**AIAmm-)`30Z0Zsn4JT0Sz0bC_#SqPe;`ARdX|U z!c&69?sb6yfwp#|gP`*fqxgjRU&8!PtQgLJD!-HUV40Js$TFbWXsBNkIA3yvmxxT+ zwuqvp7Z!>ad&`r%y->8rBcw!O8Z>OR_6|?ueO>CJH``R#sfSz#CQ05IrHZDP?%mq; zTM}-GW>(82KSmI3bg-+u)y{`8Zgz0((69nI7(!4PPZ9`LzvWrxqzUMRc$YM4l6|3tJLC zj*oG@CteCPZkXza-)+vXnh=e+@b~Af_JiA^i{HLaQ-QH2Xsc;bWSJob|sd^-FH=bf%ak9*F zsZ?R)sBovuF5yFmbDm|IFC31mM>@y?He zoxCI7@`4Q?(Mu2_TzDDv#vT#T(Q57MNMnWD|0i%lOjD-73=(ex&z_=at#1X)yY-i= zup1ElN2Mbj;dt$3%tBdA`4$XK7Ue1Mq>KE7x?V#rdQ-oJxY=j3;0Mw_)Q<*r%FVxE z_CJ{h%y^5GO>1s?^Z31FhW1>R*(lhQ5gdfs2(!jUY>jMtobxtlKah?aG0~A?*zO5~ z?z>I%Hzd-+gE4s-EmURu!4Nrq4nn!Q+x42}kp6BTzD@{GgR&{h!%^52uC+g5GO}!6 zC206Co#>VpJzm62+R)hcvZ#n0=OIp(5x=zD&srlsLu5``axn0~obR=aA-@H=(=x97 z^>|7$i4z7)er6B5{_5X^my!lt{Z|T5XbI1lIfSljy#bD`2)*i69_pVtDrRfZ3ET z{Gd9a1P%>M%5^$V)L@JA-7R<=mzw0xeDBvQ@n~IfsZau?Bnric$k0k9R(#y|LE#VR zL>fj;70V@B3cznuGw`||;o5shvh{#-*ycv}O3U^nAMvgG!x)X{k3CH1hL6%{af-+=SKDyA;__I6 zu~A$k`%^2hw7612i(eVb7YK1c_ZEU;%8|>k!9(TPP#MS=KfhGVk@_h6{BBX=EY4tV zswDkQu`U==@|1$2)QURyWvUjwx%uB2FeLTc16*Tp{^_YnXzh6gN26bF(b(LA(dDLH z;n`0aN0w0HxkZAPhWZ7@DQ!15{e0j9Yg(4UO+d;<|L_+tuLcLOP=*bD{$Tg zX^i9r9z!(?V)(@RXwtM=6tQpNYTY>78460 zgGw)r7V?x1+pW2U>unx`b+qCS2>-X!u*1w=<6c=lI@XKmCm28Ti8fh3y~SExeFQa+ zF+q%ce0mM){|Z!ELVHqXP9a#`G2yY+lu6bIu_p;e@(G*iTz+TLdWc@fE%RV(bNDyI z#Oo3GR=H{CuFrIfu-A>#c%|V^L0j+#nJL*n!{8<15*p+$zP2mY5}ov;Gyp0Z;x12u zbKn!}X0zps%5q4B0bZi582cRApldewg7M>L%*JMGMmZ&h93@_ehD91<(Movj56k|f z`IV+_E?(nZKnYKZT2zIQ$r0`MjUYxh|vaG!oW&IA29hDnLis|r-vQa z8a+bQs;L2>R!x`qpvNnvAvBO{Ff1iuMiL%2hc2=@{bGRRamOc7V?>?_a+Rff)RAp% znl8|+QADGFa@fhoWmPOl9rSeL%<5wIj{mm|1Ni{hEvC%VD{%poL4)ud0dITUNA`#B3 ziEKJvuw(HGo0dOV_YY$*k)yf>hRywJKOEv$ZNX3YTUssS`lOUNPlDW z1%_!J%;~+(|8hobn7bjI!nEqq%~|3P9Fol=bCKD{Eu4)PFsS;b_9oeSu+>_J!^cfT zvG;LaZZ}v`U=BK%JXWCS&9iuTe^{40bKa9eTKIUz)7>RO9k1YwtY1Q(P|$H_ZcUZ; zGbiri=>+|7fh5E=ePdqpB8MbsWUQQ5Vv{AvUTTA32wHdFaMnVtwhP3kf#+iPI7+0- z$387q=c_VB-qIu7LK}FIJP{ovwfx2W3)AG0+tJ0i8`)gjFP0KsJsBV3S(|c!1S-q( za@7RJV>!8e>(ui1UU%v^lpW(&>l9NoJZ$qWQHY30Y11%~9;G-;(}z!^{dthZ32eWW zh)=S1xMu$}E$gQq_yIRlC`m>Xn^w8cH=Xy7OWSelY~ak1kt8yCw%bBNr_w+JA}DR) z<9bxqJ?|2yY8xmZd? z`6E9|Yg9J)ze!7NDJqNb89z?1Hys=Zu%Z34d@!;klfQa&c_^Va-syLn@j>+qxM zVjd9)Cq=(MeK4zL(ce2N{6)9UG9`K~`$jQ4J|mzyBUQR>cGNU9dCfxMIg2jv)vF}f z%4+_f?(7(3X=ZJe=hV8k3`H@VyC5kJ69<68HoyCglukCX=uOnBly(#-r07{q;@&H- zccV%}rNZqVAk>!86U8M+SV!qv#2Ejc>aN9}tjgNajUfYerMh`eLqkIROeVm4z1?o$ z=s~VqQr~RG)gOxL;D6NnCW*J?yYKpGcWhdcjcZbEe%-`R=4c+?-@3C$g|7YA{gTr! z5=!-FfwByCvi7N;o|3`H;Cm*SMr!Ecou_irNLLyCz50os>y!XfHwwo%*lT6~(kfC) z7)zXCF*_Wo`}KQ2E^;x@FDMtgKsL_UtUyh(?Y$~Hji$bZiX#UBL}2mFD@jAyK<_3G z*w~AV_c`C+soft`<&ajONZs!T)kimM6m?};y2ABN@k!!L6ThFyJW?ELb_OOY+Zm`H zxfiVn-V$EVH!5CIl%ss_XF3<`G%?qohL~F<;g2)obK{g z!|UuVw9-EiHEcur@069i6KnHRl2p`|)PY83nRg1?7o%ss%VKcZiflR+TRydL^aC{D%UhB z#J<>Tr+yd#@I&8R#omcuRc#efzdE)fBs2~_+g?QV(hYbrvnEF70XR!m1FE=jQh2z# z&?*DS_d`1wCWeaPR9R!4B_LD-rl8ftwulj4&DP+w0Z^4YluvT*QXNDmlU{O`!j1PvJ= zX8rXAO1e_=0~+N3qTDUZiRC9CbldG%ICRqnp{Z+XP0NfYt*1d26fH9Y{r$Oqb^}!p zQgN2Q9Y<^Ou;JYw++@Azh=g6~sw4b&;}#gA$-n~qFp?V=J_8xIy~*|YvEDEwD^Poddb(90EjuclK(96 z7RIRg8_-5O`a+w}^X^fw%wua*^Y4L9Jeqe&tTMkM0VYEc8Sy&Sgtgcyhub)WnQGnMVf01Jw$um^sb=7bd()yis;@UKwDc;ca77{zI8D6d;Y`Z zSA8a+Q_jLl1|v2Vn%-tbE;uz04^QXr zO!XSiqsl;304&b8sK{Zl#n5547IGJ7f4q?uNTDeG9AFl>#1-bhMw0mJk^%?+bPGGp30{()S3{Mv`;yqA0A7dw`e{jNIg#V<@A3VAnl6`b&ljKT z82i`244Cbv$~laDmL6ROcE?ivdX-{e#!hJ1bbB_t471MUPROHxBQ}8>l!Z09(>f@U z@8cNQR00AAA1ZK_*y=U;R2l>TL=-D+)hFL=g#osK%xtaWphe1S7-OnJY5~y8uSp4X z?jmS|PXY2r1VG&Aj?sPKXO8dwkvW_P@*a$iq9<}&fNn&{QA4E>Tk(jC3m>%t1=4fg z&>aON2@`HffRV7r=1U-@dDY;xw_US8W1Yp3r-izjYW2%Rm);o?-eDhY@%QB|RaMFo zH&n_H*1A(clE473qS5|HAq*YLdO61-9_YL;gXi5`6-Z?)UuC&#keN8_3 zt9H=&B!$bE^F^)v!Bj=oG|=k{bSzdFanyY(w|?~D2~lm_ZMGumPi~WDq_PMEj^mMZ zER(I2VD~-mMFQRcUM>W%6gTA6*&bV_|Jl3CRxV`zPqAhDEP$`jEXtYCY6o{28uuS5 zRH1=7l7JpqJ+FqO@k3<}Ty_C6M&eF|Zl#pIdQZ8_W>uKemp7mAlFlD1^#lBi)gyV{ zyxcqH9`NP16u@ZmJRH|h`H|J=eZXlHaA|odQGUC4cP3D(NMA0Z^liH(+&!c!uZ);U z`J}M zRJ`mB>s@m6#S?Oq7dycaU7B-@+Q)$TI6F}{beN?H*9#;@UlJy`5P^@95d); z`_%!ij_xIimO7~dGBoS}h##DHw+DfZxBwv9Ha-X4$l#&VDAneeDL4SywXz&mvuVSe zEOhSx7ETgw)6rVLFJIps&>I~I@O;{Dqr(X37;H4#ieC1)0Zp=z_ig&=u3h9~?93Wr zxAXqz^Hr{qPMRy-apyb`&;!t8AokQW`cr|MM*qr`RJq6HyIJ*-rSCVv$TE-t--c%rsLQx|)ZO)(XSr%lFj%1e zY}zl}ceMM+4|nU<`Qw(0<)=`*Z}@(dA5dD**D0Ka7AflK?xKML z^BxK8=8|1Bmp)-?oV(A5=K#vWuDJCtmevR-_+QQ+d$qF`DQa`X(-K3?4TY{^u8yAH z;%1#&4e#PsUgzTl92a9=d^F91BUWq>r2aPTNS?ech|5&9wYpF3w<^@)UE zV@^j#x!Uhk1H-%dktI%UQ&VqyXe9P`)-3h6&1KRCEC=G7Kt&&ESgsvn-E?Gh`{$9s zK}?(x;YK=2K<9w5WoaF#_I%MX>c|(EZxW`NJnHjBRoBglI;?D`m3vGy8PW?JWP~;t zK{2h-x5cy;9b`-i^bRU2IkLW3S19iIT@S@7hBO@s^l;p{8L4?p#OU<+)&uX9Bj+L|;@k%|iGBdgGH1)xzV0X)MLMl{Ubnkl1%Ei|i#n2%!uf-sTA zg_*2K!nf`9#Fp^CfMMPOM_a#>cF*tPQOIs7?iR0vq%v4aeQUkDz1}H>K0t;ZhhP?P zg4tc?KJr1(79-6Ab@63MHwbpp3=c+%wANq45Xl_+eZCLd5A!}m*cY8u0M)^h2_56! z>(_CcIg%B0U_pK|>QuELOo4JL#-^XB&A#N+)Xnm!Fm8&gaL|KP##H;cKwaul!bB|y zXAq{iQgEw%fyQUsvBr(T3=y4$MR}cd{80`AdjDwZhS%E&Y|Hi^awIr^G8+Ka(0s23 z!C(_WytKq1{z>qM{AZ=vHRu@Vq-77;4}PEjd}Z{nCB{CMs2u2Nv+udrF?TI?_dSx7 zsimfzo@O^u65%x8&^!FEZ4*$npH4EVwz&uNbJX>q2-Vup#>HZNsfUUCLaZszXY`$$ z*UF=cDE7a`)64Y@gCG}2=u`3slN2`pq)cYG`6jXJ(J1M%T@4Md)-=(&1SbD?65JwR zVY-W3)Kh|zGh1K3I;v`dJodio6sIIH>$H?58I$2=&d--}i};+WI4ulUHysDSka?=m zdE{?jz6ReyZ;#`Kg4;hyQ`UC0$6dZhCU_=90hHIxY3K7n!%a#uQ?7eQ$5*}^rXr8l z3)cQ2>9_+&1>$6_Fzpv;QqpUb%xZ)lFif8I*-KpX-!x$yG_4vR9h-pTF?q2xE*)F7 zofe*+W*+h62&%9iT-Ift^dbAzj|Z8Wjsh+&p9&mgGVTB6>36(de=*od(>l)vuz1!c zd`|c)MEwJG=6wo$Y2GwIZ--A)eAi&uNwyDdf^rA}qp^i^wp^Ia{CuNB_6j`q7Xku7 zH0xd~8}atQtGFvj>}2H4(G`j3v({jx1eUqQ`ebZT^!wiJYjTr^aX@=Tf!wnor)W%Q1na^F5+W4D&1UFT$I>l39PKwRyc*`Q-GJ_5l%|02wpNYxz`!H+h;q(s=T$f@^Tp89F5 z78ibb1dloha+Si-77PSY7q)2=o6(W?Mu;cz`M@(#p0@^grz*pl+Vf9#f^^F6Ee)e1 z#0lzyY2-?=X(cX`1Ww}6?<(bfu7Je2C(x1KzBVVl2Dr^@BvKs;iwosA(^` zs-U3YhpjNcU8z^LDKcvW5!i(15k@-)LSZQI{Y_7mz=LDgcLP54y`;7;4 zjt{`e2R}Bb$t~NiroAL(NAq7I^X1@v50JTi`L8`ljfsA;qkAyPtm6K;>+o{TO6?9S zcSoDmFveg84kdL^E#nQVGuKOhy*h9TD}l^yk6CB=-Tg`o5tw(AZXvi zr`Ba}WKemY3!Cz33mV#m)aw{p4a{qR!T7!Jcsxc_Uf@W^3z)x67iVg{#sMC>ZGM={0Diei)peNR zzwqv;1s`Z8*=nns0SQdE@<28*&H1fN(KP87z!0<;1sp^-SvWve>~-EAWtI*h86^sm zT+>y-4b#c4{MbbqWXsd42GG=2DC2QcMZw}vAI`Q3Yzl9uIq(U|05=Im*dsns?iTbI z%OJ}WZc`w~yOYDfwA7iW_6#u1j_lWu0kICdXCTf1%1Syw0nIR4C9qv{%xvc@UPN6pq=l9TafbpI{FEysXf1=X^j83XJjL7?tzmm<*23dd z<0TP#Lhz}iB;u{-3P6d5JNUptPDnSug!%&qLslLLG;KQN@oECpBwrV&QBoy=*JMn= z^281DW;jnMI6h8L1C2-CCa0Y!?Dmk2*h6h#ub?RgIF?H zB0I$r0r^tUX-nK)J=pJLWSc7@Ebwo+hRW6F!GHvPE` zA1jk>Zpjclx;G%w688E!Rb^?t4_rvK&$jJ8G;>U3)rI%}ae!X{NS0s&edE#ji=XZo z9Wmy|*6_bgCESYPwRG=s&@m%50*HZ{+O%0P`J=O_>-Ol+ZK2CXZT|TIOi$Hpq)$({ zL|hD2jV6A5MnsT=YUN-mAg#V1d;HijRty+|pMAXGJ;j>>anR`MM;pT>vdPUeKZ^%? z-2L>9#I6m##8K5`U`aMha!i2*8>mYhEU$x|ZVb~Kq@Z*G6AR0Zir zzJ?=$8?tjs>3&{NfddGeJ6v;VqcC}LeQm&W@Z13Jw}3VtPBR^WWED0nfIfvWcKQYx z*d`GBFh?h_EoB%Un7?FV1n5@cvo~15Sgs zGEY$Ot2qvaIVw{Nkb0fO`y#Bd%wxY%Gd^moT>5eR2zY)8@hLLgeJWOyEdVNgqY&c# zccM=PWm8``JfET%&UBJ@{r_lt>!>Qfwf$EmlokO=K}A83W>L~0%>rrZZUN~MQ0Z=1 zEV>pAQc6inryvN@-AI?fnG5%OzPk6je`ky{&iTjT(DlU3XWrNSnb)Lv;v9UJaRTyo zFW9iHJZyL7q7_C%WR+q55KUht_nqQ+lunr?Wk>|iEQ$}4`{{0ho8Z%-<*>`|SEB6I z%yWl`e#j^HE1bi!$LNF+(q_Ffxq17f(P8ch!NbQ(+ZAcEL5};=WFiaCx83|pADPTk zrX>=PU6Rbd^{ch$xDF%#6;OcdDT7{~LZ{dK!RA{dd()>S7S1QU2tLmC7wNMm7bYRN z^@|em0tAq?Qr&O2Bpx*sfr_XB+e1nhPsR?{td#% zy$uU!LRG*leP7=$}h=G*|4 zlInFa_B11)^bMNWw;LExrF~@k48MT*MfD3j_T$?LzOyeAq1+;mJK|dX~6=ur=K~dUm&Za+kY%t{cN2 z;)G0#>~kqzO^=uy1MlhH^T2sO21;sfBxszBUT4;fq%~71$tgOZqlaOBVHW~!IACv2 z=>4F#)?O>BOHbMEn6V(F!}-2o2HXZ?r%flFm!gd_J-FaBnNIgP++3s;OWR~{k^nzn z()V9coY}830fUpYQHB3s zo>zSxD7-}Zrf&g>`lpDBlhww!%^ZhY{M4(?2^vm2v5%HHwRftoX=d1Ewab;*gIuE# zZ6L3Mc3tfaQXK~wE9DhmXO^iiK%-Xh1~aQ7miSU)rsE`KocU_GQfu47nhwxQ8tR8W z8vvPk#OVqBOUy%LYa&1htVqNm2C!65j$mqqcgSfufX!;;$vTDU-~S2JXEfkOuNQnriPomC%3PZPi{; zo7!h3N?LdHKse!0BB%DAAx9%rcY4t?0T;#CIymEfww?eAp~@W})7C<6klu)R*^+-B zC3{tUx@K})c{~eETbsJ_)j)wxcgfm8g>0(e8p=Q4=cxO9)Fxl9vLa-vXrxb|$zxL- zlJoJ0k>)F5IbThnuhJO>-$v5VGhi-(53!H z^J4gV_b4V>xMM}h&XLttn9F&LhX7t{0`_v`zA*d#UI&Tq=L0%W9!E@I;)u`eI5kav}*}9%9 zpxrv~QMbKHp!ul?qid`nxR9|TUtvAPP*xk4@Jhy!Y1ci2FK^?s-m#5l=jyF>Q|^=r zAAgs@d+ACj=K5C+i$0a*NfF%S7V}jk8C9;9ek`8wq$~V|JdEfjGU;delE|p4!l*;j zV0ETu_b9oiF5{MA22Jl}B*^u}S*>d#w+OJLho1D&t%~sLt5P77M1}e)5wmw{R$Wc{ zEe#iwWsf5#)T8IVtfk0nM9TsA98K}q)yJ25$`M-`Z(UpYj;~wuKTY0}JqJf%1iJ;O zd`_(-;{@HhX_zW`>+VolL-~}{V`whzA1$Xv50)@$bt?R+Wn}E&r|AQJxXwW!sCl`K zhtw{V`vA~Vr>T{xW)ZZmlb{K*P6>h&N`Supcu2&*JdaIo` z-{`Lz_mdIhfUbgP{41?I1)~|^&CmMlH|UBh+cum`atJ^jGME<2Tyg7^rymc(=;{8d zr+3OPphFeRMUEif)tsK)P^bSnQ#$Ccx4K`Mq03lW)^9JSsNar0a7B+Jr>)x}(wv~j zG5$`5UnN|8VqSsHqyhd8Qt|=Fw9qZOG0=o*5NlU2R(m^xNDk1YEj8=^^A{jF`sv$; zCn4PU&wo}_gE;~TRrfEHBsZi6F(z-9#|1iB0Uqr8=}@%lX=g?C{GH{eHO@8RkI@i$;!wv|;z| zv>(|JOK|oORQn$W5J((K?HMtVOIiWWmb;FV-7NGdnOcAlK!vE?fBY zrgQW7Z*aXFd^8 zp`_Qm`pec%pf%Y;{F`@i z;#Pm0*1swuc(O^NCC@qKZ4AdPCSk^9qFVPHp@s(JCv)31YP82o!_5t{=qmnbl~+Yu zv*rVR_by9g)d<18<;)CC zplR$Q!zB3mb4buj1uZc3ZJR4~bY zVVh#?d5UIb3ZL~dSylI;b63iw`WD`@>F7#$vWEcmo)fOsiq^Oz%BW9@K>l&OxG%i(|^RvrMAOc4xk1qD~bbv zc+Hgup0q#X^~__Iqu^!0ElOaj?QHT1oG>igVXSRGPhmXr`Z;PVr=d-Hoo9FXhUr;i z>xJuiYr0BwcAjX8I;9yPpp7e5W^4$uc3LhTyt;{)IZhGG){FR12MlypNNXNsFx-p} zzx%^YkQwLB2!D-7c6m{1>mXxy>rPVdS3-`ucn^3J{5CgJ z1%*PtW z?k>7Q)HaXRk*f+3$L=c)ogOPUJRQ)?|aJzy%`tg(05s)6Uk3f=-mK~xunua-uiqobNZEV zmJCE%quA(@;d&3g-UOVleR_sV%B{ugu9vVy#4BV<9qj%gFL;mr(F`t9FiD`%*=DQX z_OTIiPbYrSQ#=c$@&1FgEFZ}pKq0#n1sm%}=n?7#9rkWg4?%}~%sOWe*gt>#^`Im@Lhk-U1KkTB*qe&I8G$R>*J#Ywz)^$TBt*hGNr?W z*+1c(z;78Wzb1L`;8Fq=b3U_&O%yPt4|;Fz_HI7x1@0){>Gb59ZH?w{i~??VrZjoI z@LbrgP-VBMOy>Sb(@Fd8&AWJ&524(6UB6aET3>KM-PE4n8Vy3^z5ZjsKe6rFxfMC| zRvdzgkI63YByW$68-gn!Gqk_X%|#VS^;}esa{quuzX?#^U6`wH9 z4YGqlDJVgkyndHcc_KBnxEGwn&5DG-l**$6> zS)kktkyj`$;0bktJ(8V!D|IIV?B=~iz}vX;L?0>KVDUN6=b3KN%TV<&8qw!_@T&$7{m zez!do)(up7d3HpnTmiCGZwO$MRzMRFQyzx4{d1e$h?qY_v-y=^yRIm`ywEdvHF6m!+|mJbVTlCvVi!Yep@h`m}jc=-w0XLceCG#j7W- z{jHV$29ZFEWKs`AxC|@Bkc~C*-bi%$dhh!3Quo-;u#ITr2yHUf=rltEq{NGchkm>L zCN-+4|7EOsGf$ht8n+<#(Hh?jyyG0Fn6{s>FP)EanH#SNZs6G8QpS?U?YyQz0r2rS zUFa*}q5aXnS7FKyO=fAlWHoR)g7jsZ_4D^<=dF+Qu~1 zZd#61nXJc;FYdwi(st(j>>g%XLDaW+3;1{&O?$$G^-rLC_J069NBzMT!Fpm6MI1%pO$SW#x^!U9C z^Uv3kO2|rxCRkkN;7^H2^;%a_@=W1x+Dl}$Z|5u>IRKRMzr9yBU<13`#KcX=c@XA? zc8({m02X7@70Xx(JsAM6#>O`OKT*lffZKmYB{F+$khoAR*drp9_lRCI+IH3=5KRB# zv$vD6)YgTwnYQb=8r67+{7X`rTUgdy_>EGUQZRubrSp^jX+^j+Tc16<&eZNL{eh;k7p1^LVh? zed>He9;g*XRz%`stErvVY~;~4m1JBX%!Tv4c^50LaDfaS>O$tV%l(QnGpHT<$t|%8i%Jk)yWx+eY+P!?_kgcvwDNa zjZjTW_XaN!U)^*UAojt1|CEfpv!i=cBot;}T$Kv-PrTIkP73}NSo%=*N}77~{3Vga z45Tsi;e&CAXCCdG1KV%D`UyQZ<`z;J%K^$*)Db_B)-Ya_bBl;vB`nNk+-m1z%Mc#3 zay*`FS1k-g3Pl!VEk4g%Q2yi(h@5f0i69i{SgEv z;5(ZGkjU41Kn}ixz)j_$69xti#*0V_|M$I-3&ufQC;`Sz0Jc5J+j1pWvL@)hITEM6 ze@rf`oJIG){cP0`9>eKS_yFwJmyu1}Nv`ArE&;yJ6V6bR~t2)DRNU|0`hzrte4Nz(HoVG5>}416fp7=U?pxD$LX9 z7C_K6x@h&C1eyoXlfNPB89B8}Rsh$lW~BD03A<+_H%@DyvTb^=MR~HD&q}tK31RAG zStJVTnL7nwd0@lEA4Qagri^w530!Gq^b??ZeKo*j9%1H>2=qZI5W)G_#=QBoEa&6h z?wjB4dLgE)9lrshS|sl^9||Fn9C9YlxCaz%+(Jwal4j-kgrNq)ZuEvNl5Adv%->cO zx{Q&@Q_^sHdk@eMB1ozFN?!S=A%?C+MvjRy0BeHM^cL+nh4(DwQ%M-TUwp*)-xSI# zG^xEtb`L(^7KpKtQGjZZMj#+q*=1y5EW!@!BfYYR-2iLt$x8d+w5j2m5_XT*#Dpl( z^ZYdMnd4djr`0%2-C%2?dYA9SV4})4Ln%jY%d{{xAt8@Zzue=@3CT59H@F?Fb?(&x zUe%tp)VTZZV3riYi5FmM7g*QV1~dI(5c4QtMIPJTlPWl^A+81h%tbrk7)}HL3cI8E zaNz0K?dTC(@+fV(9M}*O0(#IvkV?=~3xJue=NsVw)(s|lstW7$3S3w)VznG$U{^1* znHF%>RxQ?|6D(0LZ>yQAVwMOcQ&};*h5DU&$`DA=k9@!&OIuiXhNn%}3Org0w|+#^ zzit4UGF|oekO6f9>_XM-Qq8IoAy(tc$|X9JvP`!jYvd3!J(`=obV`U0J1Rbxfgbw1 z7VZF?h&sMF+wOfem@PxB0DN58)pwh2QUw&0)Q_KQ0YC86Hknd~RmJKafYZ@Il{UvK zzf>=$JVjz9rMC0pCC2(efZ%z2+3>R&>x&q%_9Kho+*!=goL6IIm;F<&ZzUA!^nX#g z5l12(a&A1NfB*2|t@lAe254dm_@#QnrT5yp7z(VT#LUY9cQaz{>fcfEtWCb>Hda!+)LZ z6#&Zcn^w|erka(9-!die>`Om@?lR2HV-Zv5@7i`oZe7)MQdgjymw`$6B#jB+CY7#x zCWd&lz&t@ zIeB9%4`zscOSGw9Rrf3der#j!h+ed0SD}dxH*crBP@-{GEe-b5TqKN5c3UcHSGl-x zvXHtJJF0CFtE4Qaf_(Xr$X=n$iAiWb1PW;7X!JLLr2aPPJk6->?{LKJz$|)qVJ?{Y zF2TF9sn71bKbX|Zo|n;^HJaA|_*HwZ5sbfun)Po5wBoZlsh)NtqQo4pctMBfVXEm& zK$st_eFJ9bDbk+?PXSw+k3=#jJA4&L6KJq#Tiv&W9>BScE!OnG{9t%3QV=PODaPuO~Z|EUcwwy6@(;;P?Xd5;15 zXSTqvpQ4TcXXtd6V!p`&^}dOx3Pz6-aPn3FkZ+82mi5n;RwPY3ZD05aOgr+fT3SgH z!N;!#_&F&+R=#*AAr%vRCY@?^k-SyHW6U@Xi8baLE-qZ-EX)%i8|b5RFWTEHlP%awJCsu|kn4Bt5+L2( z3M|7*J&42c=%=%s&a8y=RJ#3oV<6t+#ILg#bMR}CAg6FMzQnDo{`BBEY1sLiL_ZH1 z3!+x(aA)SwJZ{!)!WgCfkXwQCLo$Q=Wg?gHXPcY@+phYHO(k{2!R9`JF*mdGSVwDp z0zJ9FA~_FNk?s<9r=2=W{>w$S6Wry_M2QO%fAjGg#{ysu5|WdcVh$gedCtZ5>sh$? z29cW9tb?(LD53T{i;+S)L*oy)WN*k|x3tJZ?g)vJw4W`;>t2<7<6Q2Ud%(S%Gfg6bF8UIUs?*qTx(VOpK9pnFXXejhowj=59L})nz6{ryw^J z|Dc=H-}nv%h4WdZ`6+O~cv@JI(XA}?KV`ZO#<$A~KxJiW_tre7Xoirus zGu*B*OmQ4h2R&Wbxu4hNXWHpTdH+JqPV&2*XRl}8)&Ps`Mfae+w>y1_TqTA-_2GJT z`f;y|k2ijbzUPcMo|@IzA)Rw!k&D-|3mhgV0p|n^D}fJNw)LO~f%bZt1<-od!5vlm z`SwCgPdeWt0*!8}>SaF>T!E86eJ1|IroK+q)nwm8QBdCS@GR>>Zp@jz&RFnKDs1D? z=Q7WoNxsmnruhcDuG2Q>gBYBIh|aRAVKx{F%x5aIhTAwbd}W4}$ituk^pOcozw<2h zyIvU#{J{7;ULE*rXLM`S|L8INT2C@INatREu9Q+{J?;EVVWdy( zTdu#&YmE1)YSd6fLB3`qed)-hYhrnKW(bGRSi9kDfUaON&99xoYmE*5X~H8q2m7F) z+hS6EQfWy*=L>y2@3diul}<_So4lMJWhY>ZEgMpsnDPSxzUcCOP*bdHWxQr@<=_yh zsakNF0kKnIiI;gh^EL||*b0(b=dVDgoErgI8F(U|**EzO*L!F1zF3@CA`J;~C?~W# zo;e=<7ZOM}=7Mx6>+5c9S;B^|g&LDU-umVJSV)-pzO=lc%7VbSk9CtEz{-BH>oE;( z_r&N=fZXMOnsV!Q#T!CJdFxkoH_X?svK}}3bs`N6<4xaICk9vsJzfk94BWl=_`{k& zH1{38#7FeBvj;ssXV3J_^c%0sM!V}?8+&|o@zdjYAzHrG#T2!k_BgDj_HMILUkbmb zPU7U=gdx0V%aw&6>{g0mh(TYz09y*WEdgYxeX)pD0UN`1_}cF3qeGxa*Cqdcb`VF-*(5N- zF5kgV>S?Vk0rE(fA!xr9-0g6K4u>1E;OLDl;&&Z!6J`hpN1;4vfVGgM>xUP+m@Q;B zh3NipoUah}Mvqc??X7#DKQjQ8Nu-%X!!`p?mM~Ot*2h9M!E34iAkW23Q=|T_2ZnfU z44-IJerRdi&wtdxBLJ-aknk!Sq)5Lw@XV)+l{bdd@}!py)MY{Bsh69`so(#}{|mHW(ox5hfxA)V{kpd0^?E0x z=%1oD>KmRoJFB3^(yL&OL#k=t3yBv4yfA?>o(T8%qF$5JF`Kz}!og3rjHUO(J{7 z{SDI;!qK^U(A)6Lksagi%939{*5uR;eCr{f4cs=vru%wq!N^Sx%3~hF3#B6I-7A6sWN9n-h0Z5biyPDRXu;Y}bI$}D- z=O(16O)#2}JQ3&m3KrV;0|*Sn-6i4@Tls`h9O@kQa{J{z$4yqpUz#5fof|36`5+tu zOPQCTSIzMYoj_Mk1Fq^Nt#-BPJLnJSENz-AJH*#;%=oq?xW0pwMBBrVyERx%FRwwg zSc_377YA}tS2X-}zf7jd5Ukw@UEG%?C?ho+ZXwiv=qr@=lyu}Q%1bIUun~w-cTSMNyJ3d)XO+M=2dmx(AV1q1tc1tV1 zb%grY(fAbeuZWIh9#oj*4Z(-^sr|R|w@gjb1;EZSu!m8S=EDtou@kAfn@<|(6lDp7 zxcHNTIiV}X{%t%ongb0#Gn+7|-|fgA}#!)kkW9dgQIeM`I2!SGJI_laHNBZu{7 z&x+M17ix!-W^Q8HjfhC|8?aCuf?ImOqA>bU5||uNhVjhA)PyONxC~mX;4EP`(OVeL zXOtj4`q%@D_>*UP&j)9|9IS1oBl~u6Fg5vN zkZqFoR9}dFMUwG6HS6eLdLCX&-tHq&d^jN{C+RG(vgcAFAwY>x5fUX|&gdR+X@il& zZdq~mw%KTa*L&Frf#b1&rF6pb%_dY>6_h}hLH14`F#&NJ)6PASPqkq4-f;Q7Ltpj~Mh6pu=+Lyj0=1GYa1Mbw z+c17H-Xzu}0%PVZoGlspzNzVmFcq&DVA+=J@sZ}utopc-k6RqmaHn17aZEjav0OI{ z-zKf$n&VUO2XVep2+@oG8B$~VaFOdc6JREno}Wey^&}~N(RdLNI*2nKef==h&`c>l zSuY6>Akr6$DNYpK2U(brm}DC)6sw_6n+f+PYaBr&(s8VJrYB%|{<-C&wI0HetGhx7 zaq9;IS1QV{(49NopXQL4_mvE7GhcfLKVAG-qghtsN zW?NqtC<$xeL1@On0d9ZZ3glRLt~=YeeP@bkK#-z5ozuiszA2HW07?Dm{x;Wa4*PlW`#>})`ZCEUr^A39a#5mUIz4&1uB zgV+(Dr@v7mCLw(z1A$ofgI0_`A4=><_A?A}Yuha=MMYdeIsbJy>>Uy{+aXtdoAspO z);t`Oh^_;9A@$!E(pGMcbZLky3}ZZy`{$An)yLQ#@=)^^qP^TX71Dh~nI;G=av&~p z$10aq!F7r<$VOLrI4qi2)-PYk5zhILBI&u?4f<#uW=b0q4=kix;HvY$Cw0w;ekW>| z-g7^G@vD=EW4~l@ zfwICLbCpmp|2nWsw>(_U>8|c(RV@C(FxtFho4mBUF;MvCM6sVkjj14{>tD3e?|1g( zgcfLiRQ4Y&QkrRC)Lht}G9SuPuqh^&>!nkXwmLpGv~)E=C2(+<7N|T7(R&@XF`hl*_4A!p%SMJ+kz?BAe0zbF zsDD>wu3>x9qKr)~^n7nXwC~K&AFp5$B<-wH?`ddN=D+b;iea*fMJs5)BvvH{cE}+x z^|+zmSl7ajK$P2p+QT5`31sQpsTw{$*bDSsHqw<*#J${SnmPnWU)$j{OaXhrs#im zX3~EM`{G6ArPcdXcWPY1$xN;@%C6Yl_?kC<@xA*6W^s!SzjJszFVpChJ}^>CLLBxyJR=DTv(Tbs6AsHWFNleM&nP{+Pp3V>(9+#N|BB8+h%y zJ1d#j%dJ>qF9i-^4y9N|a6O6(*Pp^;lz#0Ko)47Fie7m!2H8F1D7_ z8oA+OquR8Smlv)x*IDDYtV3MRoe$QcFj^`Zs*dLf+9g@jbE$_}KRSPuQYTK$x$qP^ zer3y)TP`$-9Vw-uQIfKOw_JQwej2N`u&1r@sd)MH*Zr&9?ITk|v<^Lv*+qXPa@=bB zg5y&D;TU%%^5D*s-Tb_RVtMrxna9h$nR8L6>OV}xJR#7hiCk)HD+;8%hH=6q#96xA zP+4W(%VW_pD!r!ccxmsmpME$Nm%mOdmTemHGTo2z54Ib&UzKOc-E%ZrmaDW`lC69y zpmYAoD$%IKB-IHHDa2#YVOT<+9MJo!*J$(^nrP(ZuUf3^o+B?`JDY0o1eZZQgVS?2 z(DWpqSm%ZRXi2=ShD%B9a#Q1s+hzSKjj1+d#dgZ`}!)aZw&Vyo$ z!De-@$=dB8sd0Ymq-RwIzT>=xe!#!&2SpD}+D)@dxm6zTy z`T*Bh*D!~KtnZf9$!sMK9SrY2Q&~8NJ=oUx-9#+p{v(B@aX0Uf4f}($QQKH# zv9XEBwmMu^dT?l>giv%$#qiny*rrVJ!xxi_M>-Jq)g14OxBip0>)jB{%=||h%sV;dn zBD>bn`7E2leto7Xim1i8*$V#ir0IpeR@Btx9J%%F=-hP_zgK4+k$uj$#w3S{*}OJV zOfOTrFhr7AP>>k)92b}&3lX}B&+IiPhNtK~m5FokS?fjZ=~5>u>g%Y^4#-D7sz*|R zSa^stCNNtFn;AE5buK$@Q1Pt2$d-;|d830ZUG5>Dch zo%uN1@|f4pczwK*fODocf6ZjT>f#25qpcB8fyMDhv0127IiXwDt{r!q9#tQC7gE2?b4a9d zC_()me#S0lf)%aTNHJeCfHv@1kVNKJwt3m5{37a~-+gS=q5PaMdD8PAITj*`?aAWL z-@M`Cuo%46uKXC=eR`bp>29WZ^8(%qZtO^!hePC7990;N19Zrkq#YIa{H!D!I$7KO zEV!gWM;0UGP7f-LAh+8L*w2Mwyh3KWg*QfPPZ6cLGLjv7tIU);taic*{Bji-Prd=`}9$!%yZKgIR$|pH5swI&>K8cBR$T5+5l+pof=vN%#L;n>AT{9m2^P zjBpv`p+Bh@);%X+W@3&Oku8ZEuln)`zqB0vU*izAa4l3r_J&^EgQ54Vk~Nm`Foe@) zlAV#$+bt19cKKa1NyD=uZ}dRAfyF3uw^MUhD>$NsoU3c*Qs_)Zo<)eH-5UdA;NB~- z`L+~0`z~8psui_(hY>c*bvGW_2YsYUZtEdKLY=jjrk&bf5#!B zsfBfac#MRa&=}VBg}^ATuBMUpsa|r zm5sKvy9JwJoPIMuhCW99w%8fO-aQZdek3Vs>n4V<%3O~Qsp{VGXgvx-bY2L?68yCc ze5t?dpOxXuj}u^UvI9s=U%v3MlT4bF>4POlqTlqww8y3zSS2c8ruRMDBfH~Mv0VMWt7qbP}eHqgRs zO!{Ppe0CrmOL}%Uj$Bgi30^W7rc9iD*~@nz7Khy+O?7zl6~%1*?RBsfAzmnk8f%$i z{h0en5BIIiz@G54iHaA*Y^2T14FNp%@#+Y3@I@-N(p&};mJ3a+tivoEa(#@_h@h9r zThZx_Z1fqs2m*I<`RB6;^Y`lB)nBlE-AHT3SKXWwX$rkrb+*PxS&MtEk_MAsgeSaT zaF4bt3Z_?!lX`RwyUKNoNBcL!)Ypa?k^bbOpUI-DE_K?VgO%~Sou2YDCpKaAE zq>B@5Py!!oa|t|(pO6cEExl(fuJ4Uvf$8BI`f?i8hni{$ygc$A1OH+-lI?{Qe!K*_ z4dO8p+4t{yqWXIDf*eBz+4}LKZcZUKh|kz^7CE6qhuihhq^m5!_oiyup+EevBUv>& z5W*`#RrQ7v>n_MaM})n+R8k*R?7cO12qYe(4|=}_jPE$spPy|j zS8fD)%(H)S#u8U1*ECxDz~8ge#-!aEVYNCW(WUpjlu_WOv$L4s?@bh)V<7G*)POYW z+@(4B1~E_H2|^cSlYklgc*BH?O-O^m@~NeY*1^8T$ptr@llV6F*c!2}ETj9b6e~K( zet1Z<{e;`WFnpva*_M$-i)7EoiX~GkZJ7e?4vkOab8M0vqc}9TDn_!ScVu%N=(FD~ zr`=>yynTv|zn7#8;)#6XDnt=|yMbH$qkNlQIpPgEx0-$R z$!pEw_59^jwM&7W=WCvmc%OP>Wv~hR`uXbV`0-I|%!{pt#04ar&u7&&xQovcWNu@# zwi$Pt#_Wz&3sSd3vR@U%>n9Plx$Ap4BtwpFMhBdyQJ@cMqtE-X?Wt@1*|$jPIP)5$ zNrrVG7JavgsHVTUrluGy<5V8T5}iy!6fUAUp-a1klFm*0hVeD8k;Dq{GelM)^AW;;S_#I=|AK<;I91>FNP=)KmcO}* zdXH2}9lvHoKMK!zl#d?r&+^?QtBP(q*OrQ(wiz;)DdFS*j+gj8=3V4r{C^7k)Z}rr zu3a-xk$Nty47P|MgD%lHO9Y=$Y@3r+cAMu#R8ZnzzYZ7npF3=RM>}l|(^`*p+=}b` z^rh>Lj~%+`q@R7xB|D#&0HF?8K>-iT-?-xMCGTSb(bm=gu&mA7u}b|wMi@w(Y-kn# zuah8t!?VbeVyi||6;g-#t;RB|lm>b%GEdQ_XoC2EJ1wd!F5+m%%c=BZF9>UqcAx(g z9=*VPRQEwBIxGVeS=nxiZ`TvdgjsLw=G}HZUu|=Or3I40{QHfv*=1~%+zMFF$Df4; zuuj)KUW}%a?tSh%3CZ4=zB(er*q??g-|{@Xi|*f+F%t?&Y+;P zYG)2nr14H>^ACQ!Gfr}(edbttAhUqke_r0^=s9H}BwJljLsqF*HKbxA?rR4kthRjH98 ziSn&6hn_nY7dBl|e?C7!H z%hBocp0nl%KbSn+Y7m3I*%Bc2GO{L+`_f02OY`xKQ16rIQ`y|;bkiFQnNR552#ejh ztA#u{7Pk_W-@H*<_fq&<&(ZVUav3VA|84;k*{jZ%vcI4`;zTRc!S@*3g32kqxW^di z9^uWv)c5itAt;&u`)NaTw67cc96E9p%8OiuwtYTwSSu;<#gtSv9}gwEHw3Aai&Z9w zrTr8H-LSMGhhj=-D-gzd%<)seU)YFGAjO?G{zUh^V>!j=pM8util(Za1bdhTKX)ze zdiK0^F0{&&n$3L**tY!v89(TpD6)#Vq0(dTl21nbh`59lM9@{z_Zb>av3stZ*-8mzyJC{|N7cUOjq?(Yr5^)1eG7`81a(pFP~t*!xu~s?Dq=lQ5!2~s z&*UuZGHcj&T0*e;6v1aykwB9}(N^lb&@oeCwh)wD?(@GrLX!T&OEp5G$$zgzQKFie z+1YTz+K-C!0&abZ6@s_fl0aPy=Mn!$x%B)HDh*v^-A44*cFdMP%oP7y1<=%ue`k|? zceSO@Dd<1ybf`duP$|Ym1&!n5ndMh8<_P3W{V|lrHxpT2fqyS=lf^hRwiN;<9lhAa zNB{LlBDFcKkkHkW8kM+c!mt{k#htafb3@XnLaMGrS=^^WCb>MK%D-S+w1m?tk9C|< zr0LZ(LN^D;HJ85TqP1*#h((IzgUWj-{_ovJT$*HJ3cm1weeU^f(0)1pR=FJ^S4D;V zdr^BNcMRQg0~x7qX&_g{XF)W+ilF_)!v5|0|JFF(L?-=-ItO+fhvmdYWeUOKxnN#R zkLpTEpFEan-wD8GhR=U?v#-gw9^U*f12XGIFzB3vj8iS? z7zZY;I7kK78GObGLoce!;ro*O9Ggp)kJbq#!N;m-Gec??*`8$*|6Ys_$nRnTobciQ zLq}$r^=6mRDQ&iB8_T{yZ*5`of9mxJZRZ3MgjDaAYAKe)%S9eB&^n*QX%1E7D=e}o ztxm3qDfqp%nW1FePAq&far8m=?^y!@$$fyJ14CO?T@H+_uA-})1T`SVZ*Sr6)prJc zkhdp|Ee-RcL{4TUNHIj3WKj6=a*zT6Aerd9rNBH{?W;Rf+7B$5@{Xq5!!vT{oB7|l zqZ|deP8*5M*887798RRQKu#nAt(Xx_^x)t$xLtp*K@wu5D4zxaakxR}G$Qo7kX?*c z@Bzt3K1tV(P`7Y^+4}9ZUIGr}j74RD)XY|QPDf;2-?fhI)q(ZGBXCk_`Dae0Cy9oeySQg?%$^WIAW2P+LoS9g-#dd$| zfsxpJs>|>lj`6oq!Jy{NukvC3Rzdz0!6aA|P?;*JhU}X!WQP_yqrU;)%AESswWsBh z=Kz4s7xBehnuG8-tVEmwADS``M?=(X)gEO zRll+Sf{mOr;*tF(p?tZC8(KRn&9N07NAGl~*I%uhZ)7+H+$MCmYuj}Ph? zI?l>scf5O-co;?A?n39hdK_h{=V@Ne=~@9OrSD{|bGNH}689hjW*|uPc3tS50y|Z! zyTLr7JUuPPJM;HE02h}JaE^SFUU(Vhn*B`UyA0sKrv32*pkPVHfH*QZ-Y@-%((bkp zDY0ShAs`ccWjav-*f20)RuwH?2y&M80&`!n8n`A8$x z_16gq;OQvCgPlqa%f5kz8BmJ#9Gs4_jk$J*jom)sx;9GbX1&K*&%^88ljwKHgbGu2 z?lLm|?y{&@$hPZ8wq5Av4xEfV4eSZX0iTLRRaIaMHVAw<)j)5s?*gJhxpIT2hq}|@ zm}vn(#U{;%Gu=rJGkvYiI<(T#u|^8jGeO367=W@f8zq28T}yG_iW~(Ldu;LBFTupb z+Ag!c7NG9?YI2gD;BLTWRT18NKIjR519qsUlfc6=-=E6!WwQHLwH+giUBj>4V0wiNpEuh-wX5cW zo`M6fg0&?ifeCoW*B$pz&^2~Az&h6-1Tzt~vKTE>{rLT6@0+#N)LEgLpR9>Qz5O=J zjJvaXAW#-ru8G&)Fxt1QJiuU1rL(hhcPiOjL>wj5AO!73N1LAR3Ob_6CG9#2dPk{# zjE={Fig|}(>kAlt#RMkYdFmZqs}`#ZJ}IKD_Y%#;ruj?M7KVfi+t?Z zxEpAq9}2pD&|k{$J?kZ@9V@p?lRNr1~cj-UaeH@fr5wR(A^S8z_9R<&Keev2Ox z3E!nW!~Q+hYTy;U zYuCB~#{B6^=7_DjJvsGu2$ah6IKAdzMDxpAB#_<f%V;;W4+kc3m!7^DeXa)%n{pM8TZ3B`;|~QD63Ko*xF2nO#krHp zUlzFH=RZo4Y2@iuf!wb!b{B!64&}17wJj4YyorMJ1xIrX1q+|-`BWZcR{*F6X(0_) zF>TwEow;4Mb8eHqXzZbc$KSp)>#9iQ5XT9NAl3+RPNJ72gV*!uw5RDgfUSiZj ztb%5??aA*1QzK*aJ5-QA8eqgn$MCyrg1#JV33y0{yp}3f5=sy@D>MmWL4kLYUgb0~`E& zLXhr8iJ?QfQ@R;CMY@}zdDrN5p4)To^L~FmJRi}$_w2p)+H3ve7Ytvkw{@?hg+##y z$4Ts_o7`{Mhu>IHFgB9knR=A?6sHApP5QN`ErLM30&5uZDAi}(nt&P9%XR2R=Y1ip zCIA>3X<_4O6%R)Nzgtr)8yE0nR+@W6@CBaeCx%pNR@r40pQD5l52wGzf4h7SHj`BE zE%z-p1&_;<3{7!$#k6=J(HWni``sCVYBF}N@@a>7>DwPV9)#dErs2PVb6}Ci(n#}r+76K75!iA5IDDIcXLSDQ zT4R9a#7%G0IJ6=yy0bZ7D*i4Gc4+%?%zU`}#i8mUFfEXxsK6S5=vLX-&f?sE7GfXA zZOM#*dH7|!liafA5zR_B02BnsJ_PG#vH&hq9X{|oi+^0z9eYSjPR>Zc`?FHc!mhjz z=*FoW9fD)qjZ>?6KtJo}srCB+Es=6b1IQssW;ZY&P0Sp;$?ehGtp{%ReONaF` z^mK=7w9w$oSrzx~3a>&=a>-`h|69Ex$5i&eegD~m-=PkLm79l5%($?T(S3pUL z)AD&`rRioC3JFjgvDEB*G=6{Yg-@nbG;>&`Y5!MGX|%bdd*u3W`Z=XO zv5Ahc&_Md%a&g8@BWFYj|A6yZrJ`#*8?WNaZ8OYLjNT$L5$!=U$?<( z6(=4&jRq@TAn3Xl*(}{~(3O*>P8VLntK;{s!J%zw_!F$vp2AH(2#zV|rVele-nNH4Vu$zQ(Ilh% z4$?JfgZxfkuNPzA{LkE>kZiUt)_4%p#58ez2-GE%Z5q$_1hX_@sNUm0xfuw+2_Dhn z2;{k{(;O+i{5|5%DZguKBB(WJ=K+mCz67p0^{Aq6$MGmrh)qpLNlMKj9t?76WCe5`e0H z@*&?21lb|X9$U;&<3J7Y-!h;li^E8Xi=o4Er6`}~(&2(Id>iRfZhEWb_uLAoICiO$wmFHa;>)Vx_f7C_Gn(V|9zIn zXAEyLKm55YqY)}5t^+x6gep1?u^pMF+{}CD#8J6$Vk)~4^YbaUq@p$*_c402gLfj4 zEFOZfm*cg7zKXiOjTCvN+!mj$#y(}n;OMRkHCO}QgF-!hL6Uo_AA@E|2;{*YaeIb( z3CuJvTE>^t$e%trtW37vp)?2;=E^I4>XOPleXtE z@HuLMr2kNFBA;RHj~}mWBMUzmPAX#6;n&RGej$>FgtuQml{o`OywR zwiD&sjFAHDNU}lFGPu)fugkk0evLALELY+KmJ0~iTYxm_#6+oCkMa=6oYHae+gi+G zUVK;&^7E4i4i}+Fa&C)=_2zEd(1r@@`N4DPkBfl3rwVwW5aMKo=dRlSHV+Mcm+qWt z2q?ArHe3Boo22lwzzVILx`g|Eq>mmuJ%3O!Efx;wYp;fL)exHRJdIBh0R2br$rQ>W zBA+(znUYw;2e2IonSvO|D6~+&?KT?+s{w!#_hK%3cHgN0q17;0RC15DXAAs^S(r!7 zJ}whYyR7Fe>Bah!kHVpf50AE{8QW$-Dy0@d^M(rJ*&Sbes;Dh5@nA~(sJX`CF%0`q z(!#B^VUSJUrfcTiV*Zlsc0>v#SiHN|L&Slqx#)nJOTYpbG-_{I8IfNt4RzTP;J z5S@)a1jV#JV>}FD(y7xdVg_p6NsH}vKo?L{JJmYaES^>OVe8S_K3y%pq7tBF!>-lExVh!X)Kf*Vlc-=j?aQb+lKeDnrb z$0d8o3rZh>f`Ow_<45$x^IhWQj9|mTDtj~gHVVgfo?q&P+raZkLg2&Jbaf60Pt<|} z^=p7KzTU0SMudF?tt{ms$P{?&PXUcA2RLHzwV~+4cA`Hxqv_;dYPp{*nt}_<@MzB; z)3K-(rEPHV@TdV^WLKrkih&_E2T&R>vO7ORS)Uz~{Zk8W>PHTFamQtXhIDq?V8-N< z1f!z`=S4O)=#*3|Z)`Lk)up_CG_1@Ee5$>2Z&iI0c50vx@Ny4kdjYcs+WMKGT*12? z>PN3pmU{|xv=?e8Aux_T5LNN4&&L(i9;rJ|XDQMu=WFr+FAQF1V7EhK--u59Spp4Z zJJcD?l1#?ukXs`@Puco99uD%?FMcb47+vFg(h&1>4Gc z8sAFcvMi8H>S0^wy#?+HYYDc!cW+>V3+ylTgh_lFe;I}Cv))3iLrCDgK{TV9DJNWH zdMcpZoO#_Sj!rSNe17*MYDZ@RJGv-p2mh|6k<|fR&skN~c}7Z$c+mpgGBhWKqLLla zkH9pPGeP8`0GMz~Yqv>=0r9Mu+T(oJmqD|lFkA}+m^*+y0_Qu;dz5N)wu8RJ+D?sZP*zN%kz1`eq&f1bVjX9wY}oB!Ks?EwRksC1 zE^{@$cYnCVbpt++<);qeG8ix*{J{XJ2j!}FN4)L!O$eLneJ;BSvTQ`h+~~ErOWG1%ZfP+HZhA3We{Pk!#*uyXdS+#X)pvhNTa@_ zP5L&$kUY=T57wWutQ>W_Kv2(0#>)D}N52-Bs_0#U7rxLt=qYq{t{8wDaC_}Kjl`#2 zF{d)(s)tp=)4!bB?^l8NTJ{a)>lJryr=7Q1vn5|)Dbf^Im%sNw0L+LTIoD;-b;3l! zAcqf3iIiCc&@w$c+SL2_)&~Ly zvJq5TO(-`Yjxm9qh8w+AEB8P9Jq_UzF6Rjl`gnH`@o_Xuv28Uw15CRw07Cb|iqgW0 z-=Gcym*n9%m0dJPa-Y|}kFbd9D?5)ImAtqNfM2@VPjpA?mahNXuW=CAuZT+FO4uH# zzR@q^yp2N<%w>7lWr0e--ZK%STgxA?=%(PSdF3+-Af(SC5mr3bWHrK>&!0NHHt)}7 zz-kVGM$iw%6R@FH(qz<13fdUT&>Z*3?gV49Cv=r2k2nuU7(fX$L+Skn@yO9Yw%i(n zfG7|OzTfn&I|CEJ4*2`MO$Vhy@ojUlnYCU5?jRL5G^(iiaN`TlS z0tm1QbKVAs!95>izSUhtwAPjl_DoOMM+q;Up`Yih&NpwNz_s>E(0vehKR?{!f}#pL z0lQ6a7<UcXD9 z$sk3e2BlBX#mRxI)@riLI5{Y8G~FHH;t~Efr0n7kK@eicc;>zNt~c24KmZ)C8LaI% zI5HUPK}*4qzpqfY`3cU)@1ilv`JfP`5EO8ybC;517scrd>lgqOHFRCMod7z1RjfYJ zm!td!O)mh`J<-C8rX6?g{E@-CDL~hGJ|Dngz>a0$4N{F|RiKyoy^<2nlt15Rok?JKZmK(@ro-A8== zLe?FbH zihg?WsO|^N=pmfVRnIY`5mS}6iR=InHzX9Wk?1%>Z*)8SZ3S({2j`{@H&7BNH>fDA z z%cpu=j!kgI5&bg=uXjukgfbyo%`x8-uK!1nZ(u|u1g|E$YUD6Ay&zW}Toq-yf&Mg- z4i2AD(+XT4d%RXGr zZS-xms8eF3uOr-jaRLOIoYFv*J5E#{=Ra>C)1PWsG;saACT^-CNwQ|=9axM(FZO@8 zk~mVw7Ot5}&z2AWh^ACPm};kn`4g2%hiPyyL@1(~g}-~AIVJ!7WQC^opeXu1I!B90 zRNH#Bqk3qJGEaFbzT3dt1pnz83?Wf09+WPU3osEA{A%&PW zTDj`Tr-iIBQ2f_^m0UU$0h(&LMG0T6Mu`6FGl_t55(;YdzZ;6*Ktu64450*U(qK;% z2qPJ$2r9TiV?gBZmf`QyU49L>R3Q>4XTNhlee~CzDzU{<63yRU4>bLY{}-Ag>syxr>IZa4 zD?f#dE>~L$2J^q+E(&F2{0`NpI^9=FK#(TJFeQ^tDT|{oc9^zjlwy>gM zSOednRi>j3Q%*oiX1nS>H&wVr8BjGTMICzO_CK^bdJUk=UT7t36ide->Fr15PsrGf z_3-7Lzqok-(lX&-X2^s>ksSyF$R9g{{@>twAj+3Q$MiR>93b=v)BnCBb`JmeA*q(& zRhg3)sZ039f*$@U=l{k~^dyWQ(t$dyK8kcYIWaZr-H2glhUp{6dwUGzek>Y|+Kx=>DljsXCO_`}5ZfWP^Gab%O#fs)MHL#D441(l=& zP&*4*>4)(QlY8A3zm$nK`oi}j!)#kJ=^d3-tr4ehW}eV^7*naWD zbS5v?#q+jtiPXq0_S@)v=<5&1<%a^W7nL?Pghw1{LjJyOQ>c4k5RnPJBx9C|F6p|F zyJcqkzyU+FMl!i54EjDj^e_?Y$=~Cha?+e8{|-aV-^OezgJoitg5fdbbW}FC?@P** zB!jYMpn8Ngx5nRRyt9&z9bTX**8s<|L*@c$MwUh8;bK>NeuvX+f4{kVWQc-1B2D_v zp$NWgHPq2JMazX7bLA-bwRJ1PVparE`aMI{}F&himi-;)ie zTJBpn{SR4|Fblp~a2t3(F(+Yr5b><`(BJ#tyHkYRhY2 z;_jH}M>DS_pKYsh*rp2x>ALyeHWgX!-Fv`cdv*fUxhBuu<^%VxunVF`a*XrJcl;UC zaIfa4x^{(36p>pB+tp&hD&)NO3j4hRqZ5Kke{iv}0J$gkBn~dBs*`s+PdhJYV=C)N zIx+h;vrhidK&t7yakARFPj-6wG@R@G@wa_UMof-P2&(C9t(w+N8oLV-*xTjY7S0 zP(RM$^0J!O;ka;u*CZtO27wo^`pE%eW>ZmmI7uRWXy(T&PDWWuwiiE96t-ScW2X@< zP{mT z?VkTDgqM!0s^DQn>4U&XWtN(UfqsCUIR*P+f}+2$sVdBrWW~8nz}Av6sg%}d+*HXF zC%l*dQVJsKaW7aM_H&|z4IpW#sxX@AgUaYYK5;>-Cs#5np$>!Y`ozJ7S z`$4B?OYw<|WB9$|j%}WPD4NX%<|s$GG-;pvV?g5$&9RDsYhhuT0uyKTu5W69hj>WA zmknyR+s`x^<}_YSt}QG7a6K&a(0L@`gvfkAN|V1jKXbIVd_mPuD7UMOdV6bISq zkn_JOn-%h?n{s^~6p`qfzu|gPi_HEc6)o9{*<~%edA`X#zJ%iV0h%o!1t(A&9^YLU zYy7m<@Tir2rHx4}Ouc9vAZ4HR@Fl$d$3FehZBLG*hWnm@Xft3%%tP6NB zld2uGHJzg(a8*+bq`jUX zji>~SNt17(pYva>P=4OUIUXiE<1SzedxapZ?97I}#5foWrOpz1eUpycTTF;u7R7gP z&wMkS;%%T-V%=2xVf$)}O@C-`(lVNg1W=ZrPG89dQQZw8dZ`MD!HKyWLpD`Sr(<=p zs0Xo1{)zcHvL`+VJz6s*h=TXY^w@nZ*E|E`LcFRz;3%M+lYLe8E_7Fngu{zU#z8jb z>rqeKQw)6l(zh~JS8KZx@Iml>+qK)6eWJ0KE6-1uFXG#dxZ>NZZ{h3(?60ZO}z9fany%=Jd|s`?QZ%7tGcZ|sm&oCUQsTAn-V@ifR|TDwI=AW03rL7bh@ zYn*z`=7euwaLMdqda(7wOZo+FiQ%kUYz<3MOc#s0Gl}20vz&pP=YpD4%&9t_U$0vl zRvrG>#k~K3LqYV3`p4&J(#r(;Dhv_R}H5;fMdm~ zBkNG)UxOrrdZF{LxAWBI$K3(1)8_Wd#|&tAteYqtgMjsG7e!U^+@-jxfEK6>u`01z zZ0WyP5%Xm(*}5=AXNLk#Ww`6KChpA7XJR^y6m&Ld+eiU z#Lo(&F#ld3A%K8swPH$OQfXH%A>$A=7p)Dk+ac3heESj%3H+)3+Vge2hn)R4N^iu{ z*gn1$@bty#V}lGd+%St*rr*BPE~|e1V5iyyY=05X$saqo zKJ;?Udqjg&73YeoJHP$k_NyLsh}Rep{;(4zi9K`e!|-F)-Ej z+0o+cvJ%o|M@PXcl!Ju^o5m=+ANr1h{(VjB<&^#hE7{zupEo$q4iGm$@!Guk`3t1r zz|;O|bG$2Zl=y&iV7||B^3S9tDm4lcCh3izGr=PrTu^`-383tUOE?;9+VyZ|t;2_K z+`kIDJv_tQ*`VDgY5a%Z^m{k%fix=pB-ZavT1?6_yYwc8Uy{w#*BiS@Pkqp7Jp5oq z2gIt*#&_p~Wt@TXPr-K&JBh)0NPqfjLzA~&-43Nbq`!S&QV-YX!SVIjJXNf z3mtAGd6Mg!8^omy27m->1An@g_5R&$q;vwE5pFo-COedb>Q=L-VGRE@K$Ek*nk?y1 zTW$z@YNMlXgMHo)w3uR8F|^*YIlOLtxH&Jec7jy;q7a29r<+^NSqC`)IZy1 zJGze78UT=lW~}C{cC2zF;hwEe$lU1Kq&?cj0MGsGE}3!6FA~F~@%{c6l9jh1A*_aM zxKtQn)EYkaY@FZE_^H89qPBCITeHxBM-jUcZR8sjSFBnUCkYQpsNHmaRR>s$x#(fq zd5VlGBTr+_uj!*5p4FQTg^oLMB_H>E^T?}BJe}}J12H00P{ceMrl(~O2 zQMz@}U(+c-6@hxlN1fz>!?tH%ROX^#cJwyE$j>)X8;eO?X7^uVX38)`7tXsXSmIJ@ zTq;arg1yn_g6m>dty-2f*O(yKzkdC{k2u* z!ToqkQH#+4L_YS97=9Q}E*jRKKNT0i=hPYSKeq{P0`=$r`W>DQi0(b~09$tNtcD)@ z8(18pbHb4^7!UBBnTcJNzEywL<%b4~m!kMc%4f z>1B9S7EGf}#bF1<p{|xni3*Vs~Ejq4>b~BayzgOPz*!%lF}YZCp-r>Gvs>aLNE8z;Bs-5e+q-DJ_t> zZfmESCvez5_tCdEo_}t{%E@VewFU5 z(EhS-BExm{mqk&0i}XvFizqc5l;{El-}^T`0%z~c1j<49)bH8yWjIREEq8H%0bn(i zE1-y9>jlxEJ zQ_=007AsTZ;ht&^piu^*_N;Z;xF>34`|48@+_rG_@!VMBaeuMEp}K{Ip^FU7u3q`n z2SA+jBbEAy;|5(@!=c_;kAO?N^TT;Bs@GFz8UQzuwRk198B_rn&qJQxIEJp8@moo^ z%vVQc^PCRkOO4l*DqDfR4`bJ8-Gzf`8GQq%!9S}X&1XM9o}=ZIyDA|EUtl1Y(5%7> z^;XsV7PK{6;1Ea&GSRVbD)k3pr9oLh{kQeW0|BcSlnrK4e_jv>rMdu|zACZFAlGdl zO4?R@+Cn_+`BaL3-Td?yPAMaoiZ)B-;{B6e{dgeHL+x|J2S?vduC79o=x`PeHJ#5$ z&h}Q-&N3oqjB+WMd0!l%&u8#y-~w;{dJArz8g=ZWm`5>p4J>sd3I&6#H~Jry0+p7b zk5bHcKlz?HN+j}*9KP7^jEw{HanynXW`GYIznUCiL zpSk+lq2u<}MRJ&4+S|I364R&K<#^=q+dyI_YT7yD+#?iubxy5CAYTXcB?hAHzrE^d$w1S1O^M^v`R8dp(Usl#~w&2qojFy zi@IZI#sO3(!6wF6Bu52EicIfckA-i}y07Pc$tCC?Cthht5!L&3aLSvSTAJ=?S<3jR zvADZdXexS3ny5&PRj}8k(#Ed?G|<^t(j|G;Q4KK6MVA!P^EaPx4-*E{7kvrrH)2k! z#qf+L)lT-ZKw#vs25`e}J-sgUOIT;ZBfZbM{Mp%|pYbbgmM4r`f2&xj5%3T(p4hRQjXO7Oz%PymUSfM7C?{bU~nJ%YD&0 zD_3s6{BUVQKXakh6&Zh+By6Xemc1hv)|Bg1^#Iv8hX-gNq_J@O?K;Y)~IvPZ)27U6wIv5$Jf^kmq^CbK!MD^2`xSKX}pr!h#yP`v4JPgNwHB zsb3MP!bLiL3{)DZo_8Oz-5vn)EUBNXrvQdtpH;}|)itAD&6%&B&wNRnPnw^{Eq{*X z{eHVlV}HUd>B7v=7xwafdpyfF_CHhb*Fji|s>4}bI1$~2MI37#T79;@5mPO$K30Fj zbt%))va$hoX!tvg>bbdM6HM%F}ZUih&u_(~eAs3y^jsx?@Cek8;#`+v zO|{OB_HIbB!m%bmrK%`7FxhQMEokf0*>Z2BvHaYFfnj3($b$e+SS;L|h*9#Qk4dqW z;h(R;d2b=s;GeG{;+dV_W)Xi$Ce^E>GYpg=*F3+U`sI%c&mf08oF(EyP<^idO{ zq>GOjRPHTYUF`SDyd|0n=2>-IPC#u{;M3&v$8np2_4`$uKQ-B1Mifg-Iw^*8w{H`X z#@SpkfnxCZN8t$$>y%RdL9c+g>T0Wsh)|fz+_aIa!b?%1e`a&a>+g@ZXRd~{RK7mA z&{%?x!q4+0f@gKz95dVd(~KN7l8Fs*!Z{zhE+$C|3?3k%+!A8XY6H|VQ8nemY3ubP zOtS(hEF#O8?|(E5kxKdUt6xd!#Fdczh6|Z zae7o5CGOupO^kn?E81A?%5q!qXF?_cGT4}Ve1`60!?#GR{SbRWn1`OwMlE6Log&X{?m$C4n_1w_<7;4vzP%tW7F^zD0NjmPFnLCC{;mM>fc zSjJ*Eo{&dvcXKe2k}l;Tsdg^5UKwpK1umNi%xPJpp@mq)jpSEP@^1SpoD`a2UP(rU zY|3Z3Q+HxghcC3pQwhGSyVY{lelv^rq(n_*b~})}W!V3K6bh|nyp4u72vwmYoPxNw za@0jG##05bhx_ljF{;0exWcPyNAih;F;Y3`wqeo>Z3k{_pC$SQjNrsJcH^nbI;lQD zLrbG@YuKbNI1&1(^4)Pj-;#58VLm(vQs8lsu#NP7qB?Va&bB|UZSQvF?)BcTQ@(DN z*s?_X=O^=PXk+;fiE+udD?L6~dao`eei!LkF3mYpsd|D(&<(lDAouNlKZ1098OC8* z>so(O(m{FY(FN&Fmxrh83kp`2I=f9M*3A-GVgzs0^H9g^W;-TcLwljlkgMX<1)G+c zSkuxKTMEq5@43pa?LV)-e-4eg;;Y4&WP_E5ZVNDgd2efoF)!RF@|^=Oic=da!VsjlRh| z{N(5j?zIZrXxS)Q@T$XPRx*5}J%=Ni?VDUbcjBF}b7$v00-Az{ecXw!f=4hL-^x+F zBR|MzMMHb7QI!wt7f?M>NiYy^g}Qb=UmW~>#tG6mIJW86^;x@n0;AK@j?|+Z*U)~- z0~76?G}}qb-TdbrL4_G{Awxa3R6?fN!_SVOv;&BvAJj`cIX-nW_ULG6tNG5GE~O)` zs^HRNiqO|h_NNP})lqfjijf#3r>rwdBFxs`CNcQ_s(u-CBxO=lp-D=u9$mKHeLjKTcL-j{A zG_*b{QSCJu>OiVOJ4eT4RM`v9!5sUc(nyM_Aa(R&S|N@ zYCCj>h+bEg^HEj;X4^E3jMMyQyi6y2Q696`Q3buWb{zu^O->~+VJZQS#e=Q2oE?h% z^K|Xe7TnMy%Y}Fuhgb9Nbc`h`>({pZuc4ti4NcXyP6`y%_Lsl>*TDiTxExUYcfY{F zX|DD8j2xe0`&>t{z|^T)Y8n0_*#_QQxdL?g&!s;bTX3C~zFUizDHEM{a~Gb$4!oKn zx{iiMCmuL4EVL$2;%nIw*DEoUv_>p=5xqK7@P((oPi?_51^IeA5_v)WM19N2CX?r9 zp;6tcfrg9Lthqfa$@}4IX7J$k+x=^$*^xoXy8)Jqarvl=Y+Y27Enm>`-K|xfK?aAI zRJwKFvE8}bP(tt}-a?ZKxkfBnV&2ue=fR&U7SS@FZKe_C4d2 z9W>=g$&9+y`3wrYce{>yNRCDDaP>qdn{>$V27)Zt8f<_Jjj@(w%6AjEMSo8*b;96 zM#z-Qgh34U_`A{1&y3R5GI^+h31E9I8bO1W^)REb;_U!au;#MH30(1BX+|7goWFeB z>)guIpspV4#K}|e$Rzq9{&Y|@g_m@MYjQajJtdb7ldm?jj6T8jB53ZJolZN%t~xET z)vd0^Fh?ggl6;B~fn5(_&)!zqJEdQysvobuE))IFqoPj8uSw6~*-e;#4{ii6p6g8) z$tNY$8IqFNUvTc@Q;yG;-;?qE>jW44O1-{+G z;Chb1a`yZ+)ZtN(`3N+Zw9^>IbQcu zL|vqfvJ4nj#kk)C1z#>Dj^kk7T8K|#9&pmK?Bo5Cc*X5@eUgipR6j%k^{Wb{aZk6? zCl8^16N-r&>9F;Yycr2BaWKkDzpgFE7PIgSa|>wMoX55?<9WZFFTp2JLgC^ZyF+BOI0y+e&})uF(s-NhLxcp)K- z7Ig+t6^D^`fs>PM+mejXiL5;U)jul&Qj^@5oWtaPNDP}fpV zd~IdumVJV!gl*1AOm;Hi&Gfh7C4a8aKOMLGpN4;4ld05&npiGrBFUZUDD)2?lt3p z(k0DGQo`rS&d%PgbGb*n4?=>R?)e&vx)Y;WGnKWGJhQg$M3jk@Z3~gASA89dArdN1 zNxJ*_2N<}x3!Y9!OP6}iE5Iz>>6PR~eSG<%+QKtcbF#Aw8fvPgV7}WMM#90%ndKWV zc{e_2;q(eoY~075z8b{0#ftgB>ctOg2f6|X4By(Bab4)s;48hr%k8zRLPrGTV$+nKL~VEn~mhMl~37xNp#w0=+I~=bR4-i*kB`<*roOX`7iW?#;p&70EXSRFll1K6 zL_lF!9W6+%wqdWVtaOUqZ)B`2iahTzzYle35kft@i39M zRQTn<8a5!WN5OO?Q;_Yv5;><)K1;t{vGcZ8Bt04qUgrp3mnQHqg9Ag8xdPyrV<_cQ z-FOL{lHxvoyzW|6AeEkFJr68;<)fLknh;NmATWM}{*=NJiX1d)xc=D%!T{xcyr`_M zZdz1a+yhXiFd$uEh>{oB7U{maL<;fAK9%VEhPZynZvh(&EkyzK`2pNtzUxa=)~~(f ztMiO*m<@2wfKa&|gbI|(=;u>%0RGPB?Eo45&9U$LJzx+v8KKGV_xzY53hGX?oxx9U z79G0SM{EE>02@kR!5G&1{&>#UP`iBOt_u@GlZ%HYLn(^cEI=T<4;U0@J2g&UA=fhR zftOtsHS{b$to*U6u4!Ry`*ZR~aQiluI`9H`dW$w~7YvjoVT1CInGjIG>NX#+Mz)JZ z(fB!}tpX!4Tn{a6SR99O5R(*ONR(!{gI8mgeVcdX;cV7(55jc6A7om=nd|pz5jlF0 z+r)&6>$ec&q4OpBA>oHIM(DrzTPxfPAcw1nwxJwVmh;k1FaIFE-W9vrqs_@^Qd*@r zh(pcUR`pT4(gMsSG+n~*MIU+I(C1ce3Ue1=+f`GXyC)f*Up*KK2ONbqe&m#`1FeF= z?kS*bLRD0vm>x}Fr6cZ4mYNwCWGTQVZ2JVv)ypl+yOJFqLlb$2-3w%d_<<>lx%H51 ze+f+S>@stuqHCn)HU;n4?pcV)?e9YKH}LuIxVy6?isg-_ZV#Nni?I#_ELI2h06E?% zJUm=Kb-f9qld)V2Fy2Rp;`g6KLTa}eycymsC)fsMY1dZq0L1o&8Oq{`Nn>yJ>T>kb zb?x9aQ+3VtGjaJiG$m4=6Q0^HW`ucEobDBtPm?<7=@<&dKU|;0FeB*1_P)y3v}zfj zYg;4y+U^{hal^iH|MHDX*a&LwY5 zwnj6ZawO>oFixyG&$^%G3E;_QOs~>8JSoS8)eN*A0|a$fx1&_}%0I6J)*ZukQb9UD|un2rWJK6gI?8bDbZD6)>G6qVir&%C|Mv89BPuWoRzuyIxzDIr`brirt@Ot zky))cui*d_1sO-_C)lhTveZKrHrSVjz|kVJRjx~k1!xQ=M&8baK~fBLF$0)F=wQ!t zC3Wfo*q|*S?)2Y(H44~No18`|qdA8@uKKqHw1v+6Xp>CpRbYcfTlH7&XQfe8vS7_o zmUM7%h(2%1whsh}8|OrZCq=I9zHN_un`&<;O?gbEc*3trVS>$rnm}^yiCQ0q=jyR- zIgrDZfo2N>r|ppp!*#+)@Xdt2NeQ&MsH1C?V?B3FXLoQpsBme6eK5Y*9Ae*RCap*< zSzS?Gs7V^N$LZtCo0KqITBXLHa!zF7|JGeI!+lVQu*Xw0Vzm`n;HUM#-zem?(dKU> zQFZ42C>PuBba#{4ylORo62(?B+);iLb;5lvAdvL&O-N3@<|MiYWrCZ8pBQO$-us2qbO+%R%gDc>Cil z#dD>SlmU8cu(cP9wx{oEc71*Zo%p`EH*@P{JppgZVlU!UnEUp<#Kf;i!J;?Wxa%ts zms-<(%Ga|<7T-TRUQ+d*vby?k_${~J zCb0gl{`b+ehBBmX_+`K{$-KWqn#5?=IpuVN@a%jN!vp#vh@a~XKNMd@8U|2SvUuu%x;Y$6v-C8+gUzAex9@36S z%XlFQ$Sdsug5sneVUN(&iEJ_WhNENQlW;TA(DV)_;a^Np-nH~(+3nOBGp*D`!TKPp ztS2sxpXmVyyf9sYubTk}1lG#KkLw>g8qQ9-m>=d(DRL(EuA8{vPnU_8vxm1=FRwyE z&(GhxX8V*wV4q$s=T$qDK?E+3=N)2!bPu+W%JNRmYd64ao`ZQ{fb}vAr+~lB0D}XX zcv{L}p3#;sogB}swn+$YA#?1y-Mly`W~|vpGPgvwcmX6J8&j)%U=9p8^obHGl<|8n90f=OsgJW9lFDP zIz=PU_xUdMhJH=Dgk$`mkDsfEZz2;^-EYmhDpQToShOz>Cw$%;1Tm;IBQ9sVVv1WJ zGS#}YnRN5Mu_3z0fD<{-vC6}Asc_f0q@d@bGdAy!cjk^{9Jg%Ox#q~e+!j23>*}?z z*TsS_G{P7)nAVh*_@ZhqP_Vo=JzmMgH#8?@FwI)=saUesV%1u9&PJ&!g9~_Sii<($ zLpx%@XvwswyHc=0`VYK$a($`P+RPUkD*}T9tkIIG+6DQR;2UU2R#cFSxmrqst`$fe z@~zwR?~@?5%evkp<`X;VpuFbPMXdDdmmr$B?;e0`Hn&&z-Ev5$R#MMG*>xE*rv#l_ zX_z0Fq&BA;u9|IrS5m72^p{lat{t!?yv~JG*SPUa&_)44nzKAJ{LFy&!`iJN3ogHgA>j|JNwXJcIAqi5%Dt1Xl zWf73-zr$%LED=E_qC>a_>z2ZF1IYtOvf4OjDB7N-XVD8Y0dl#t4J&ct6SM^~4pG0}m?w&aTDFgodVe8qmP|@o()p%9up#E}8Hj1{8eHEE$a0aNmwv;FB_Z zDX#QbR#9Xz6>{x0_fuzY=c}wbjgGZ=s>+da`H=3vGSU)WczPvl=mof5M_?*O)cyO! z%y<|a$%yA`q^kOnW$CMZa&v=Ob_~*0BLse;AS2G7SYBG}6-evx%xSH=`e1qE5ovg) zz;FcocaF}Jwkf-@izK${A=#8$Goud8XwT!nMjzCWOvU(Zi!EJ> zUd>NGy}|M(N++iN87PgGwTko|lYq@0l~JP|Q~Zj^7~RL-SJC1BS#$Wi&gJ=?G36vFI36m!yac}XXmLSD2E;O zmtitJ;Bx67!Qx0hG`D-g01|yDESm%@-ElN$-0ajkRny#Vs;}n9K=GvfvO(qSiVzn= z)!1l)?-wUBz$4P{pKJOLTMhCt@|oGV3vYGGXz?+FWQ5O+{BdL=NQmJO~TU zD~_s-NfL~Y4_UtH3vi#OoIP+7oeykH6m~>vxg2ze_cCP3Ls+Oe^h@_$d9~+Q; z(Q%u z5fI!3?IV*xEFI|QX+HnCl^~FjOb60yQX@e$ik@y`HU;0y=twVn_!Y$5U1!>8Sj z3Zr~%(A6aZUNQ&jNA&kEX14s}GtkiVSSLFuXZ+9>ZtrEDVKFcG}_E zzy4&P^#(_wp?zjKQo504mKd(N{R;f~gV;6ObEtGcqMh$9_;a@<$@Jmj!t-&$G3#o* z3!$Yg4e+m#UIN1}Ow}pJgC{Mb>kMe+9 zxIiJEK@jhzKP~3<0`c)epatI9ogm@a&^W_}DokfRCudTVnF%gp6S?)suYIt?^M(HQ z*iCR2Bmpm~PM%Be)}tq(&V77b;0K@J8S5K5MBtIUyd`1s9EuPACB4J5p>76`_~or} z(aI4%F>us|O|O__8yz{b*PSB9y|0b|prk#vNP~VoLu}j%{Jxj5&BL9TfBu;cc-b3Z zrGr~}@>G)G(GxYWm!hFb_zk|gB@qQyKeQJLDZs-%>?v57(Ly5Vpne?zVu9U`n-h^8 z-EK%ugz3nE0M|H2sf+#DDedyY@sis~bbXBFWXUe{iX-*`?!f_c(w{y7tkKk z%g5ad_XahH&f>KTWY3W4@cY^D+>@<}`Hl4?&`GzPNnk91XpH3kK3!eOAbER|I{2W6 zTAA_x+<~7Y;?%PK5ITA{q6c_`iXvyX;vqRDDqfXsad>^%MoNh zzq&Unsohy;vphMrZaK+lX~s34M^($v2)SlRwXNk7|3(VY?RLage0Ax_;jtpOKObux zNAK)B=d45ZzVM_-hGKtS%{W%)@`zg_WCOWU`n>D(B;`wYboDlfjnzj~d*XfTb+$jv z1Oq)FuZ$j(sj*Axxlz|{HY1CxOQS8$(U|QZ0WN-l_Y*pYYev09sxjMq_Ut+aMib6D z6jmQ%Ckv7j1ygmK*@f`4YX0@Ep3*FWeQ_G{nYcbQc15&x{BbFxd4tP(N~6& zDs}roYRJWCqgC6!SSG;5UA(#|upGBADz&r>Hd=)=Je3uMaWQGXQwgA=Vpi|ccIn8y z{CT6;dWOGv<764SiZ2ucm5Lf$53lJ`(4&XiV&bxDjN}z}X|X%xx^3BW82d=g#mNe? z7|lZVFhPr1>ZjfvFGla|)dMYcKH0u`84XVB z6sB;77mTw`&6k=|tpVSKGNRTghY_3#Da*4JZ#~tx^?ecruYAb6M+Ozn;tmB|JPzY( zgc$ui{?F7Rh=lhrs*D99trG5ipIc7uY4yfw78e64PS$zFz6byE1N}0KQt`B!z8t-@ z!9uI!ot=_Z*9~1R77ZhupgI+{vE%iExp;+4>CA?1Pz5UIZ()LxFZGrd2(iaUF;DNR z7Z-2ZUQOj`9J|&Ft~@-e)uFQae4(D6b#S)b6Y=r-S@9}eM3RQM&(wCT>*4T`t5TfZ zD`}yepy#k8><}hT0-ATOoBv;PXByPh6~%FCnQEg0Dpaj3b)h%`QiLEWyD%sQ5@rx9 zBM7l5S_*~&FT|LHXlqeXQdtE_2umOmmH=Um1dECy2n+!XiwUKyAuIwRF+f7EbXxnh z{nY#M%zbapyC44N_doOI+&=C=sG?81njIBXXP*rkYdvzYzIBAOMm*>-rDZ6EkQOl` zC*-WlGZ3?j1gK4u!vc4HX)(o+v%=CcCcK#DU*mm?ABQ@ovBh2dNCX%H*&f);7WrYIZ(eZfUI3IWcTcSg+N~R|4WfL3Bn)2*E z(z(X=4EM(5Gjmi!5B=iKugiBKp7i9Wn+I(C z9i=(1M)C-;v_UZ{3}S$+}qe!kbts@6I#S--?6=CL(Et&{LEp`hr%? z;e-zcY&QK4HFl79Xr<6gpBA3@HFZYioeIlhPCCG9#%>>a)mA61qVY;acKkEQpbRMX zQx%d4aT!z%aQblfIh8zN_f8Tw55~$E%`n$sEYN(iN^gCeWeB;WDTK*Y&H8y=CL=nR zrlaS?iFo@3a$$HgqAqiJm|~zE#E!w^*^7Tkd0M8YB{mUujb3VHZ1W_3HUp+gvk6XR zhradlNo}G%WjI{yrmBAC6wgeZa!-d*gU=Mc6rz8kZ)sQ}w6Ufp43#A3LbMwT-?p8k zK=b?$L^+hRiWMBzt47f35(_@@=Jd%{4y*L8JO)!kYB!}N?+8(HRZ?USDykOeWeLtN zzMi>@WRXKZEK@YLIxQx*{Z(={l)7puLBGyu5!lFf9I3^0f49jpM5WK&Re}6M-L|&D z(E=X^KB_$Xh!uIE#g^Dra#zG2x>?en*7+ceH!6~>LMG|>XSoMmH!{uYt1NJt^(8h3 zfuTbeB?_5xLJu?!INi6)2Sj4Qv3e_07ss_dE+NF`o*2cNz$Y&rzdFc_*;{4@--;LZ zX~|QkT!pG2RAdFQ_YP%SG2?@{3euIRn zvcqeE%l#=km;A-$N?*zxBysPex&}Y}jAUlf$Pf@_C%t84LPrYTI4{M(0mo~0;f^&~ z<6f5pTy(HVTX|EA14CD}M)kk+tnSbMa`r8lld%*t@_Y4_H>GDP+YjAa3D+Z?PxJmvYIj>xdpIBH>@Ug zLRz^2Rg}-S^v2l-x+ZR5ReDy7|EBA;e*zaAyG_?4_2cV4%%n`~{K)?89w(fUT`z-u zBtoWOH|Qk%^n{(o5&@--T5-;v&$*9K+pb3z~U^(^-| G_v0Toe_USx literal 57109 zcmb@tby!qi+cu0M0}4Y32n;16(%mtDq|zONG)OlBLy1F6NlQu!QWDbL-6`EI4Fe3! zxB1=A{XF;ce(&-9_5Cr&v5uM9d#`=1E7oZ>3&qco^)bVR{gCqPk2!y>x9|+#9|gM-8*R2*c=Pd!|8zfkE)# z!E5|C65_8la$aU8j(D+4Z0C%4#V&qb^m;zvw^Tb|;c)RZ#xc?M@8a-jgK{}{#l}a6 z4@=)}hxAqOX#&$29ljW7_}!M{l32{ZZkTWMvG?YIr_DpRbKY z(?Ng*^^Znd-g-Xmo{`1Srzg<}%p(#l{wa;WjJ#x)_tnV@CZ@0~2BvNa=;!N~#RkCa zWX0ji4+*S}jvZeGL^D4)$J&jpgazVDP&@#OK9;(lw}C$>1Ai45zI`qMSH?Pi!h;CF zrw4`)2_W7QA==+t=>r6S;nR?jWkQLDhhR~fj?j_iSn+`S>xl1fp#%&cy|}-w6nGq# z6EJ*5})WM=R~HaE6;sx)m}5=})d@7!zTT|4!tpSt7EzHuqyKINXppmW%g77CyD*VMm^;rf+hqTg`l(4{#>=P@<-E+ zzW9@Cb61p3tNZG}H*Q$@)=ob zs3>1x;n)FF?i52Ivp-w~D2rZ6d0%ZL!ki;rOHQa#c=O${6Vwd2EEPC} z9^Py_%QaeRD5m?&g82KT0ohzS)T__HJAh56-HXq5=J~Xy5oMgJ@naG;{b=aS3?Hnf z*mki&w+?PHrgMF?0EcDLUrqEPWZtP!570q9!vP0O_}AIZZ(ncY4@Es8EK7H2 zR1G=}aA~rhWa4yQf8%tr{`Om>9WFVaLLN;3UA>Xl%u=1(o(RO7$36&`k|{U%?aIyP zLyW0zrwlQ_#R0wgkHzz^S}u(`!-%<%AC8t=O%4|uEd+nxg_Ci2bbcWjbnR^LJT()Q zDBw0q@=yZX{{rKfOZ|F1#h+*e?@kO zjt{G}9DU(gYV4zL6L7Pge<;=3|}92>wO< z55wv4=6=Xy*;y<`t)irP*;0eXc^uIJ1_}M9+lx6I)1|%XQg#VR$xfTn##d1{e{4#5 z2SqMI?b?l+Z~oT*NoH#>KN|hTpqdqcIBLh+APp|cBKOuWjiHn88-`co&1|tZBVU}h zwYP7amwmSRYy&o}ciB?%#Qj^h$L7geTDu9kl6jq0pUi6$rfuNai9;=s{nKr4>ZC@M zYU|+^_A9Q!Ww7+u7t&IJPl(rlw_SEv9~mo^HIo>6t2Q7{`kikqLmp6YM{Y$5|HVIS zMm5y{v)ZWZlhTuPt#jRZLFKj2c{-?3VKKOVxjo3g6gN%L9W@+rTWAx|^{~kyS3WVJ z+kIS-KNNL)=tlnlF%R%RHduY9W$-dUuf5BleOXVp|{Z7vPwEb=R{>FS$j4n z_(1Pew0wP~CYo2KO1N#i6?QR!41EQKOn37)b$7^65qO1ZvMqKX`)b@ zy`4%vi6)8t%$9j~qJUO0!CFe>dOMp7SB%!kT9s<^ElDJ6A&44zd%1L?pEZw;C-NN3 zyIKQ1XJRe_QLm75cy8}#F6*x#&>$ilaZ~&>0z(;oce;0mWhT;|Lz?UYB@1Bw`R8>X z51102L<9^AwM!GWAX^lh^AH<@`j2+Uk=7V&x?yXDs!LM8%YzPYBq%3UCnI=P(4tgYmI)QY)zE+a|W zo%*#^HocAy?Q;HfSd1MdOG~_jaW;>xUz%tDds`w2ROjBqmLzICXH7YT-V#)}|9Rz> z(igdRCvxVrr2%w@f1kUbdZf_dZB8V++dMlp^7snk#*W2?|MU=IRor~E*>wisMpa(;q$i}DE~ zJh;W44)0Ft9@X;iFaBD08$U6qUU!>5DFRnIz5WYUNtNt;B!4Y^i(=+Q7$FhI&ME47 z3SaxR;CPbfS6dnK+b5? zsB`OtX|xDRA0&ptM=h{uNd%RBKsAkr=_i$! z-*H4C^$0_sL!ufyZZ1TPfu$LvWAmyox}(GLJH1xHw#(4_9SZw*wHGh(lb)_$H)Vk3 z$o*EqaJ-*d#Y|qq#tp?FY8eO(4p#PPV7ltUa{HS$L>38Y5GNOxTBYr*S_820aF!D% z8ofzZJN;{|}t`8HnQP{8)xo)9y=1i$PFYrT{S z!99u#Ui=tf)iUqdW_>hz`o%m+VIP}@d06B$eE(SMJ3mF^2v)JgHm-!oCkXndy%Iy& zvwd`Z_QuhGt_LNQO@unsbN#z5Q`L(w`DAbA?E|emQn;q^=J}fc^~C{mzd{nnYZ0J^ zk5Oc^L@K{`Sy*0c(D7dho}H=&XF7%bA|YPPX$e=+nym zLVUcyN%3tE^>{{aG?3vr8?(_7!76$M z_GyG;W;kLyJIeMIRtDjXrUSEc|1zz<6#w@p0X&Sc#iqOn0Y46(nZ)EpqP3?$!fBk} z9t|$&2xspTF?r4$>99HQ%|h3xF@Xai@IlKcV3awMA4Iwy#(~d+;KF8B^9G8u1w-OP zuz>l|T$P=TferJNsTy=fpcJ8$<((@RZ1h$L zJ261rETOFF0ve(w8gl`6`YnK}Au0MUD@qTdHkCiCW{rKNn00Rv;(Lo6|7^wFX6@uN zmWyi*?hPs^$8D3w!WuI|;LP&v zM~}p(#~2cW^^R89VQk^OB0fHxnhm7#Tqp3n(;&qj;1+F8X42$GV11BC3i(YC#82&k z{@INn@-wjfd^vCr4<1ntVQ|Y_;QA%&8lb#N|C+G+nfsc zM9bylihhP8A9Z8HX4GAM_^9@rEQN4WHQ+x2T0R5q>%Fn&$;qs}5O~z<(Wx@FFC^>< zSAv`fk1wCAjjdY*N-!5cuFQ`DZ446VZJvgBEC8u~*Qma4IlG&@F%EE|QW_jO1>V)3 z2MSaY3PvSy$E~0ANlEsiE{veIB%D)ENJY-}v$bnHPdCTHUt6oJZ{UcEM=>jiwE4Lj zxjN#I5{~4n($=yqD&-$hEZf2H+Vg|q`PJkM*e>DJZ$H14P9jhc2>&)2@U9Hv9j^Up zCFrg63}Ktg!~o+|P#{+?M~jwGGC|en+pH}}#n5vWc``>}ilk~uwhDA%r=L)h&n?gS z9;xTR9)cgAEt@`Pm&nDB{CpTX8L|dhOI`K~K3IE~wMD(0+Tz$gku~2(2nc&Pkn zYK%y5w@n&j_Y=r-%W$I`Eb>2fZiRZHCmaUPX*nDTx@HLBATZaKaq!zNCyy96d-u5Z z=jIf6;maF&sU7>00w9dVI3p;jA2Lb?=fc=-`8z*J%`4ZURn@4u`R=-qCEL)UfwJY|Yj9A~Z-2|% zOlEJ)^Mi*xPbaP|M}BsNKg@Wb?)HKjjM+fkx1ShTh6jzL>i*;ckEXqK$91$Ta)=(; zZn?yt##>Wyw3~Dw)A8GW(p`XhhS6kmUApxG?6SUg6Lk!RWj`J=UIhf`Nalv1KC7_cgv(+*mH2zM^tcE2 z{~JA!D1jg&PqY69WDzF6|FQL$z*W=Ys_EfPn`r%jo$>9gr^K7Z|A8KQc;VE)`Iuh5 zVwA|Ss!U%p|ER&qEhgw4l77f+@{~3x@8Hr98igP8;XmOBtE0#N8-M^>q5ps!Xbee_ zR2A7J5lUhCNY(j$N?-kV6$)~u=dRhN+oHT!4wU7<$d?kw{O(P;BkHx=h zVjAbnyi(-sB~bhYHWiC7CE=@=;YXCcU2*6;cq!Es({%p*`*ugqaI`&ztYG{zx~$KW zqS=JBcgnf$N@3oQyF+esC|%IkP8ae|UfVjad+D&hD(H+IJ$*sN9a;FLEW*)7a80Da zHqORjzk;O7f34acjxQGU-?+-M2b`LleU*UrAlm%>m$sV%=h3$-)JZ!#d6da7t1~86 zJ=aicfA}uQ-X&Z#^B`o}GbR6=*#ABldaGLfqm-~|etxUrnR20Pdy#guRS?2mDTtG)TNsu&SSW2!2rKYxs9I->puxLJ>H6{AccDjQ^?wX6T; zz>DZ5a2XrB?0OY6&8cuQ%^CU?TT}OJK07s~H8CvJA*Wbs*z0WRTq2@!!OHVP_(65J zQe=@8^1ZdLEX4b4q_DE&F)OtjrKFjge67YA@<+T~e8R_^?idH`%0IP7uDVE5EV9R{KJt(l*>0BR5uwu277(Tot>ART|5>Qzgn|d7l2<^UhefjJ&IC5*`N;y?{F+CG{ucjDg zZPN{rwCdv1UE(F6@#E-X`nV8o92~0s>fM)CYR=WQhvgrjf?1(o#kxi~D{M_XJmbnB z$eE6wpHt@%ZYhN#1D+0p++%q^Hon1a2RTyKLji@1SiddZ~sGXDv8&8pYka>jZOU zFY@?cgOAk>9U)(>Jo8$s8))9Nrl9{`Vhdd%evb++Rl-href<_^w-bg)d4^(_bRMgG z(M7z6XDTXxAW7z(%^6cPCT=S+v6kS*t+MXDClr97`4`F>(%uu=ih>rrJx)6FFXQH% z!x_saY&X<2H;vV6cs^bPMOaxq$@Zp=|3JC@B)l@Gvf+%%TYI4CPbu>SRM3Q_%`6-> z45=eZBl^Zrok+wvKd)tCrtwFJG3Z)hXptxwM;fiYe5#ecr-#xY=#6W6sle2 zhotei6%H(h3iAA5(GiPPPSZ=%F-h1M(N(Bn|Y~wXk-jXDqZ#W$rbx!AU zA^)?yt3v_dUF+Jz^$<4eE-DFR^ot-iP;NJFI`=wXC(Av)Od+9!<6G^?Y9Fc!n&%h{ z`gZr~73aw`sWy9EzX!cc6TJKznfG_tXf6(|ebMwJ0u@s0Gq_A{(NL~SHJ3_#_k3); z>p}img-8GQsn&4*9nU2Dae>905R3#KLHS6Q6#rJ`er|G(?Q{MFS{CEC1yi+%Q<^|9 z6n09<@9nchdwEuKb0@3*9Mg3e?oKCynDJ?}Z{C_pmZ?RNF+Gz4Kx}tlF95H!dZm~RhNtjpH`;tJ zHb$uabq0ySUQZOuEHc^YVY8er){`MCkDgCPb8K}~clGSS-;ZzhFJ&H>$`|bg|2{&I zc`L;Y4lMsTUqz9{E+SNf$WusZ zXRGGHt?(^0C<3Qcb&nL)@7znEyfZh|da+?EQjRH>6?SJ+gZr{bD)cGwq32P_1O7!= zd*sNkkuP{+$1S-`OMVQ#JF^OBDP%TYD(Z11RZc7Ai5Ihp8I>EV8XtKXGV2k)-Cd47 z)1o@EmBn0pIc`Zxo3H*fx{En>CyhOn^Rmn<8t z-GAk8i!Lz7aw7CZTP}X(=J#9epWJ1=8Ewl#?Xy*hgw?4j>hQin^5*=`U(65lXxPtA z42SKt*|{&3|C%_@Q5Kvf{2-Q&(0e0mfXs;DoRO}UVAFC3h`Iw zs>ZSNm>5%&F(J0O@qPpm<0{sk|4sSujPdDpm1;Nf1Tz#kBk7=i-1N6=Y_hbr>8yEr z!DA33VDT(@Z1kd~(t0X}&E$ML*+j3DzaxE{YjtU8SdB>9Q?uT*FSOWa=~+Mh6v?<& z*X(X)k@XujJXPU_I*(=4-$s=Nv$D16MIE6RAv7X@36OEoHV#a;ra)XoiXad^B-wTR zzoDcD5T*o0YjJgDRgDsB?L-o(lX*U>-JE?!n1MJEC|FM)`^`|nzFGbK9N8tA z0KFuV-bnlC=JU3r^shro-ggIVlor2U*^*4Q()qH)BahiN}zVu7Du3qRZRjlR7m_Ua@g)eSUgMJSiTtb zII{+@+@1;w&K+0>B&Ke5Em7J#d*;DO9U*QrTCQpxMDZ3#F%i2PuOf4(KXr#%H(?ru zs^W0-(#Wg642Co$T)a+&Yi(O2k7 z3X7*oAvKUqPz>uB&Xfl*N7WZh0z#Z86-hZqi(Py$h~LtAUMt??$5+1`zy2CuLY{Tz z_U;A!q0-QP@n4VULC8@CqnPC0tDk6C>R2jkRKsrXUn{X^V7YtX3+y=`<7xo2SUFto zFox7XwU{u4NcVLvifpMSUoJRPdq)>M`V}0I6XM}QzFGBRGapsju}s3O`m+4YEqCZ6 z)lQmVmfPsEI?)EvMqq2O3~MBza???BY~YuGLGGUmt1zen@Gz1QlcQ?#Am5ajGH*s! zAA=}5Mfsm@-sVT9gzYKGGK&mAxzJ^N#mU9=idErF-qye}CJDQ4Y z>SeY5VCZA)V}|3=nEf!>&{hh3hWp-OqM3h}me$ZLRoO!H(hDV1$MLQFcZMkm zRgW9563&(;KDrnkwTC4z2%WFc{}qF$)7`#aR~%h7G`N8^GU7%R*yN|rOrL`Fnh1ZF z-V~bnWw_hzaFu?LzWS!$Z{BfNBWL0;ooKt^O1W*QOF^Y=NCCNRoS8e&)N3m7XQE$``R3loaJ>63?-YWz0XsK_*?$zMAhXSM($LKHaL z$d0A8fdt($_i!w`MKmu&oVl>kQkN+P9wFM6qezdP9;~4>rt1ix8iJoZ?kFHgHR&(J zx>_YB-zo^T@T#ER4XhAk`0vuoxz+v6SqXhP;(`y6VG}O&fo~cal!{3i?;;p&m=6^x zsq_pfQbxNH(lyicXJqw7%BB-=(-wHtQ|CmL@0$59FdjrI*;z6F2mrNSLHo^XztH1_ z$Zt7Ov4zBbxt{HN=XSTGxq6oTc(Yu87=6o~wO1YPW9IC#Hls8%g6n z#yBvN^!#`nhM@=~cv-V&aJ($ScJgl)qMYSSV?`h_3xoB&pDb-2&elf@5X=M*Ki!W8 z9j1aD2)ZxNm>bH}e)Fq0K$G}rN0+^x!7AA)7HHHiN(nJRnw;ZN3fH>&b&g7@?}U1) zBKh5XFHia*+XL&0a6gN79;cO&XZ(b|u5tmk2#Pd_6?5r&?{bDd(bg2)&vL@O%jqxS z)<>3K#TkGLxksj&9A#=jeU-qESp&#ErjIgP->=7VIY^8Vi9WteAILA6oYJcNg#U+> zB&o6zxQQi^$m!2-`wpyl5C^6oAJ}&#t22znr@n7*>kKo3T^fFP65%>3rePlh}Q%F%yFddIdw{!K**38MlGoJea>2!~H}uD~jij1C+|kKAje99UNX}ibQ#&IJ>@Mw(4>kfC;A3rzX7|tWyxYM$g8EvSkxy2V?uy%!9_;q-O(aTFN4hZ+SI}` z9s2d~dFg4xv*BEFWJs4ar6A>IJQ;Gw%8H&7Pi&0CF0Tw(?TM4fkHEkt!A?0})f-Oh zEl&5MgLn_i%~orrE(o+IlwezcH_wnIontNOOBE7%8Twq}`gI-#sS6_7o4M)2#-xJ| zgXc=6X$<2-_Q`ZZ2l_9oonRYprt7}k^gBK$b&ZpR!0NsRtqD~a*Oirym-Fk3QPDX- zkQI7Qb9%c;9~J60rj}bDDG%%!rxp@L87m8rFktB}EY#Zdp3F?RKX70< z0(A?0MBAjAhJdKSVR4Ox8=R@S)a+~&3v{JqW2zS(CgE>pQBiN8UiwsckeKV^N9Cy& zN0~AQP0EkK*t6sv)EzGZ@RwqMtTyt5_vrjG=T)can8_(PjjL+3ljB%20;P_Vmbe21 z5Ku5mJ9CETU0PK9vgflZ4=g&D4K-`tZ0C3Pi^8nOl4+`fu`5lyDa$O@r)SmFOoZn? ziiCoeDvVl`*tYHDr|G~=tp(}PXV)&? z_Oez_C(C)IFrjlCHB-NypPiUebKG5X7Tg*)pj{MTZ}JL*{QM59&D$|ru-ebs^1Wd7 z#_{xU2k@v<!ThB|EMPLXQ799=fBGy9wL3^7prQRIL7P<1iZluY!G#j zms)qgCVy>lBGGYL=9dKLEif&r@z}jkf5^WT=tJB0dhK!@F3kWWir!j~8%_;*_UE8U z_AivvzEgZoQrgNZs!|&@I}*bvHe0p!mIsj@54(1(!np&2c%X8;usaMpxcY5ue!vl> zu6<8piL(`SW&3eg9mgl}Ts^R;f{6HGnOh1LVV@-n8jdIkREYn)Q0=0)KcQ7y!lOSl zhpnp)+l-wr{P+IkrOLAtvm>`?kR-C+Eb@5Bz1a!!} zdDPDAn0KIv&EK2~LJ(7Wl5;mZc02N#rmc@`ALBV8I69X{H9^i~ODW?vUGC%4K7A`D9yMmpanz|>z$GJ*YR4QRBp12xvZDKTq8NFfBspu zv3ThyNB)@U$BEx>)Vp&pvemzf`&&;hxgLb}D+a!WXWYbOa_rqz&ODPB|P}ewm6!L_N2NnP%2@`~W6YKUwIZf3Xyn!TZEm=lgSyn+< zA_~7QdyCPN3o%9IeHOI0xBa~xr@numu8EfQ*ug4MpiZ4eJh)CMi7dpF0}A4LsY zv5S$^IJ*9DD%tEuY0^GeqdU@i(0FUL&)QxyW zQ8_A~wr7)2$SC2Wof{nu7p2%{5O;C-S?giRzt+F}8mg!e>$QHk=`4)(y0v_A*|bmL z6P48-A9u=^MLiHTx`uz4Rka+S2u^m2s(uqau#p6baknlPnrgO{y-)z5vW?9aFkfPd z{RnJgfR_#x((8gnVXIDd$6eCz+0WY0_w>+s`tY^f>}_JgXSH_(N=z&69L1=}N)2W1 z*iDb{?92b_?RY+V`Z*m$OjB_0ItU7r1N$tKTo22Y5y}WuzKg2gr}Zk!BPjL?DTfV5 zyH+foWCrXyn&PM+A@^!LFFL?qHnl-cnD0jU8z>e_a+_<3Z9KU71OHImoN5qt%hyp#S7~NO76k=N6HI{#<$1E9N#0{Z& zM(sd6wxksI)d6h?RnO%Eg&=V@T;X_bDKiEE0ndbtC)O_ah=qX`4eixS9S@KZ&Od$T+B^s{`5}Fm>t8&19P>$(80gx@9GZu< z^zbMuJJOG&jsrAN)cv1pWuw?n2v+^g2hwRjgmQ+zFzJ1erz}O*Vb?B_zL3n2Xzt*- zi(yr{fg=J`otd+63FUH|qvI+GJm%Ld^&*RzhP{q2D7ZnTs|ODiI4nX`yZ7D##yOGg z8p-EU@agl9;IfrbWDi~I_WaZMLA)i0+d!#0 zu8)Zx?gVu_D4J`#L|cI-?O;wn-eNxA=XBY}0Q+wC;en`sPSXnDXlxn7Zk2$o zJKAGh+!B|)G=6IGX0El@zw7YrD{@PHn)abVUlr2ly0k=vvs#DsNu@$-Cpj;nc|4Uv zp}5A^k^%c3Pc~j#c87W{_Nc+;LH9s9{dVCS~4`MGj;hGz>>Xy&D3mS;}5H%Ap9SC3;Fy6EGRRul%xa4wH)ogP1Zm$dm5diyN` z{=K{uX?7&b9!jWfsQ&aA;jm2%y?3t{fP3|koQsNPi!reMP`nj$gd*!2SvJ|W8S+ZV z7`D2!@j5$lmhAS-GXG)V1?~yJi}C@zn-R~-)8wkjnpX}GeWE>?9} z&=bMevafDgaHDQp6O`W9o}{z}Q!(Fce_dW)c5G~;2M|^i5%y#7?9I)m9{KBb8sD3rTu5p$6|9+y+19F7{i>yPO@7M#hA4Qf4T%49N+H|u4^hoqu*n%?9H~7_(quT@z?wNZrm}&Mc zmy#N4zkFHEpF75Bp2tg>+$3Uill`wguoOg{nV74OZqK}phXz!1-#7je7^|WM=e|{& z4b!>d-X8-5PtOlLo`OZCRh3kIc@Zk7kzlpfNAnGziryNmdxDaZb8tZ-@ zp8%UDxr>%2t$lNGmIn$(71Ru%8J|dC$xiI$Bl8@5uUGZoSoT=neiH`*5~ZGFl=+z+ zlg_nT^G@(9kgkv6x?Wis6~)Q`5HfH0`6Mw0V>w_z3k4yim}sRVmkT67Na@968QHMq#rbZZC#m`WTk`YlB!qIUG7T>GOGS1a+6C!B>EfzYJMvBIGrpJi;koIn%J&~-33Kh zAN!MgFGzWJZpjpu$Wo``r*;1)u}35<8b<-uzVl2!l7HuXLAt$zre%2-JO>b*a;u3| z0b>!8A+k%^kh>PZtzyJH8qUshad(ItO5>xE!Eci1v=nrOb*$y*>rFmXCrX)|WK@+C zZ7^d1s7##f0&IG)`!i`U5L{OY8B7{5&gmcB8tQth)Asy0cO9vjZHB%CF@4nk%q;5X z*<{KV91d?k#{}KumEv#Sw1y4}@z&&M|6P0v+u9m9WUyclbCO@0KuOM>xumQu#%`S3 zsl=5MNx_v>sV2+!l(VD?R!`4A{Ofw1ufGnGE_F)h8)+06tBa?}^Zb`Mj!X%f>zW7J zT@&=Ko7OCjw8KkH=Rnd|YOi?^L>sEN7F}U4q_L@|GROd@NL6lKe%QLzT}okD2;9YH zJ7VlQQ`5}h?!th0uyTQOW8DzRwO-Wu-fk|F!H_Wol#wN3eex%h)Y7Z9M)t`b9xmV` zLz_)8x$+dF%pV=FGWmdDAr!7}evwxkPWZU@p!2Ve`FaNT;BDQCX6ISFPe&vb@j933 zQNg?hK3J6a6Qt>Tb^g?m`myIi6qn}cs`L=~pTzN?%1TUeS=H?E!G0o(mU^r8K;BFD*CQa{(P94&J_Pf2g((X%^H{BZ5 zF92+B#%?@C{7qyxG?0>(RwW`fYHN%Vdfqb`tBi~Q(n{0$yXmu8_L&mOgN+x56I+}_JRrqRS5bzY#DBItDnN?^c3vZd5)bj_Z}QQBZ*F6MWf>oT zeeYYYdAF#B{ON15CaygU1&w7*Q&nu+9^6z}Aqc!mL9-Gr&s?kQf&uCi;I>R?K%IGG zBndya<-Tl{}q!m4HVe(S59 zUA&+DJ>ttn@nNkcszXdtl;%LwRfBOxH&gK1WI;X0>jI4*!fcI&j0i`^;B?z=MT)X?TkWH})$2T`$ zT>T$E00>zxe`U5_`q>hTNn^x#;HmgC09=NLl@_@vJXS41b}d2B=&JH7Z2hts(AIX z%7SH&aVorst?JyjT_L5ZJZgq#S+bhC_CCL>Rt4Mn09ny_jp{qYj~zMi9UxoiNONCg zY`WSchsr~g*NflRYYm_D0IX-Jr6!-wKx~pTmW(5S(;fPEp?~VK_T1&P#yeM`^%zRE)@q5TgkXpYaES_XFp_w z`rKUXNGt9FoN)-Rbq@U&_E0*Md3ZG7G2!}2Up(_hdC9%L;-0stC+u#u*iei3TYq?F z)-3@)M9nbyI#_p|7~ChU9RDT4_g!yXz}pM?-@NE}9s(|6GKVXDCD+u1$^b;xn8K85@2@#&u*c7;Y7OAIIXk~U^P|lS0c896G2(O_ zQHe0ern{>xYNz8>>FS1}Pt|z+iR>g>Luum`FM(dmCaZ;J-ri=Bq%8uOyFZO3?6WHD- z?_9$A6Ifx+3E%$-m1h3mg*Kj6f2P5xzWr~8ycfdb1N1LZjN{SO3mi9ZB%7LmkYi3@IwJ_O7W~~4R(@cz-!Y{;bUP@;ISML2b4w0 zL5gp`;TBreUTvmyrZ_ZNvEcz)ka_2R)-kizm$XH>9>=TY`ei1aQh=6eu)MVSt}b7z zI9Z|lBk<;X7DrA+0L`^9Uo|^Ogo<)M`)%y27cXB*QF&Qf=6l>6RGZUEley-S7ZlMhpb-X9Pt#@F z9IeMH_YDVjF-YV&im>Wc@thJo2{;Tr7yeV@WIEo%Z!@J}PwhG52Pmip-G~9s{<$w= z1@lxSkwH~L-(`^3Q z-Mkc{p=llHXZo*{ld!6N$sd7C4NMz6EB0B8v8u0Yuq3@X6GTjhUw2KCl4?y>u>xz= z*K`ripwmOlug8!R?QwVh!OonaFN4R`fZge^-Te!2>{+h*Zf#)<9Vhz1b$*x0jO#lC6E2g!P>Uq!7qua_~YN(a5c;IeS7tL2^Dri#?+ANl>rBqfB3B z_$)h0l&Rf}Xl4nJrP;CVwJ5X^wTr(yAC)bU(E^B(EoCN@JdTc3fC=clJh|Jgk2s&9 z(Ma#wbrq=p$XV$Ud~|zrnYi`mk0}z+-E=o-g}BL@VnfG`29h|lR5B$t=g>*L0|;6j zvtE~bUD{tQwA5l+k}kLmL$lu3e=LbG3@Re^)~`Qa>+U~%Jd{aqbTmo|u+ZNqD8xnZ zx-A%O9@ebkl3hHy7gF6Tc3g@;IHIDm!|BkCnWxGQ&P5=i@|>!Jn#$ zntx_k9erSR8jRt#lcqMU4Oh?;s|?fgBp#};7XXWHcoDO`(-BQ_!XskP5f2Cm%;=dio1{nQ%sjb#qPk1;6?$O05Vz2U2epKrJ#W3|nq_kkTrP=L9GR ziwou3AB|%GYLLZSxp<~tRM{dRw|b#S5Bk;CqwjU_d+b-XJLw(`vh_(DkCF8QoTf$f zCqO<3N+0b9hj{}+1asvg;()$KfVgCn86Y5u_kbXZ9xnbq?FeH*T7Yg&!=UAYlN}lK zIcz0(MHVhYW_`Bb9+K5fxwqmG|5od}JM}c7kubvc38^uKX_}B*$Gu4I-Nyk-8PHPf z$aWut8P?M84BzWsR`3?;3wu*E_|aL3```IlR_Z*%G!k#ovyKlz0DHbhPjcc&&@l>m z*^G+sXJlk7f8Y;77ngjtWd)?f@|Dox*tj^e5b_ixs4T0&!E*I!oTtLD2BvZC$if3-J)({Gtw|(|S~^;e(l`}{ zg|pgKT)G4jUPWCt>sd_}iy?u!!{omOh6eV8S^of^(`m9mhaX1U>eyye`>DE@96EpuWZ&p5h8#L$d|x^3KBfkpao4%_H^*ONY-mF zaf`|rY~U9wUQW1_FSXe5%xKt6_+a-PdeawcFN3+IvH?9EBeW1wD7b5`#t{u)eeb?N zr#&z_wxPb(M-m1=a&nWKE4gb!DDnO5b6qTT2a_w=5b?)3uU$096*en$Ba+ z|K_0s$uHJ$T~!|2nGqaDi8b^*%J0WNse_+Nc%k)~e9U3?2Q8sEsUJ0s0PyFV@uuv# z{(p!$?h1mnIBQ~d=BeS0%5Ux@mA)2)`>B~+x2I=$v;MQ8Au9NlDv=r3fr8!`UftVJ zB~u8CTbv3bUkJ)k}=G6rq}|BNI=xF=0d~MONgb}XUU{c z*xG&GRV$s1;UMlfR7SPP!7$ZObM=;?C{1qdW@@N?z=4>W`G|qT zz6h|6Cgu3{HQtPD^z8jX68+Jup)^Fw;b5*&X#~19o;J~UGxO+jadeL z)KDOJDjWtK9saiY-fA`n#I*$ZfoHd?-OI|B@o)1;B_<#%fsp;| z%{-?^wIg1oZMj4O!#bE5>JdII;s$3@mpkL#EWy#FHitF#N259|S6ZAm%ZY}*U!4%T zttp_TKl$SAponTfbFI!Mc86Fyd8hZ9RwdBUe0sV4Qy~X81|;7azgPl%*LkkKM&(%I zn0CCNu?$%TKZNvs0%e$C=ye|2(Y?B*;p=YvaIwz*ZES|sF!0!xR73Y-cW+NUjy<0I z5=~lAv55=8q=D(#w@e%GP=+FtiX(H$_og_UTzk*tr%aq5>h^N5#xx-1IXLfeOUR2Z z>bbIH=?2zMWNlmGBHRN^3UQDI4?#FyY@{34lgZgitJv^i8D(~fK>Ry6D4ds~D+RCv z1kUW%x}Ejl?4+N)f6*;TGf62)?X0KW+k(i?9v}$^VmW>Glxp7wgtgIS8yxjozBdQ_ zv-L&I#Nv1&qGGG##%E58BR55P%@bhJNb)62o)|};k%|Z^?|}hyqnR>41;w4m4uA;N z9bY0S9fs5)^%X-i#YJl~as10kP9#bK$q*Rc{bM%0ymk|ENGE#u=>d9&*I^^Q7=2T_ zmeGt;x|6cbYg61CSr4)%cT0-phJpfFtE0DCzci7xtOIa|H1psrXLY3-Oa`WEw_K26 zAce>7RnTL}bhjawf%4BuEAC4^+Cks8gK+bnz_~|L@Hmu(tBhIs9hb!gZNi6g-o~y^ zL|KZr)R+#zFs8l2rBALRt)IG`&b5anJJT2aJ~ToAfLTVhS!0^~p6te)_Ea!PX&LEIHM(tJAny;w8E zCh%-u80gj=$~2LlxwF{W(h|;IoIalHdQ?Dzs#JGxUe{bu27({!eHm`UIkEL~H;Yq* z7dk02Ja1$`{)q2^L)^*3e<1{0@qFiAhC#bVd6dv7p!=g=Kqra^NGbH`;`s!5bka3) z7R^I@tRpv3{2WbPRv>DJpB!u$D|$>whU$Oe4t@rJw6G6baEI!9{yczeA3w@NE7mSe zPklO`A&v+a_470M{EH#=LxA1}pd7mg5{8)p7|@gEzaRB)1mpj_6OBI@+=MbMM4ZY2 zFo;3VTOian5&${#V_N_PqSovD2f;p(_}41~VBbIR&c7P@)^Mz=!r!{1kyNE)%j&gk z|LSW0jn4ej4&S2|JO{G`VA1%LlXIxJsuB^vX}~WzkmPvCn}>L zSQz-VI0GJHXi|u1EI5EN&6w=7({};WX8hN*!Gzs~%&0HI* z!Fp)HX|<=SMBscia`ZL5`F+_AP)C=yTtB29{(Kv@{)Zgdsup+on(2TE3e8yq`a3P- z?%lvng+~5VK`tOIa}jD9?OK56Cf`54axMPWOV3D0Eaj7k!CM1IP3}OHcNsG<$oCM4 zYSUw|=A`oO)~J4onV;7?Sk1z*JjWJ?6~sw-G{a5<(nu=DW_dDXY;EoMA_K z?I-k;62a8fkUW;NQjq(|`evn7!N#Z%<2fBIsTBF5Lh=|95u|%f40F0`sO2N1xnKD= zF)Ei7u5+@E{`!QI{NESjsshe3G?HpB3IubdjYfNs-|ovH8~V*kG6lRzC(f>-291LfcU3BRF^GE}c z4x-$#(Wz9Drc3pmWjvk#{yPwptt-7B+rBX`IwC912G8^+{sLm717)1-RCf+k*%zZ zSh0R(xWKCg9v7v4PPhB_$|GL?8D{*p9>zc$V@*nc)WP^nSf4<$eFQp<`j;Qp{=q6O zO>Z5yj}Fx>nEkG^i{=6nTbnR9phemEFMdvqo;>>nkOGVp=;!i{peM;7AKiRLZv^6W z%eFYPwevyFWf0H_BF~wwFRGGY`hTPf_zALj(&IGtx@NFp;fa)HXIMNH1Zm2i8mvii_w^E}eDk5w z&1Hqk)>`w21@{HFEm>s_SAX3PD}IHlcQ=t-hmQ7|7g1Cz^Axsx%z&v3moJEW(rcYy z_M$I2A zZwqikug1Ip0{8;AH;|PuGXNl^pUqrNStvTO+H#TuDS@=bep7Y0-YFhH zRM;-8GuY$XPyl+B^49Mh5fRZ-U$Zabs%M@wZg{xjRMm1L^6a-pm|xE;^dK%WH0fDy zsVZ%+pW3;+9*cRyS6iDzIWTXc4G2jL>dVm%5-a|_t0bJo?y9<#TcnK-_!8eb2cT+$ zPjc-1B;062+@}4UPQR*elbO??C~IJ{K-%HzKnOG zPHyt9;dXiuDSme;|KXrT5aJH*Cr%Y9`ko{Ju2ze$Of!A^wvBBCu)yYY1z5NW+gK*$ z&-@?wm9;zQ+l7ZY2zNGwu)J(5O@6pI*mCq^!Pa`sph+W{L)IJ)z3nuYZ5&{>IJjZA zw7r(G3}@&Ia`%IGeA0MZG9IkG!H;#qNrfy4-0sng)rPd9q0*K)+9@SWpH1`>XPmd1 z@H5S0#UVn%m@k8`93AJNfN~~PVjpm>F94i zwbu3Hd)7l^(|!HXzT2P5BuxM$`9as4JTkvTAH+AC7jQVPtU&r0(a+=g5k`*Z0hGir zvFC95$==nj(P19)Vct_7rhP7Ts<5Nb{oKgwt(J`uLnDFCg{oB4)3n~L6ijxjHo1x4 zdgFr>?gz$Y*q~-X%aRBhKF;**hlW5=ZES6Id7h;);7PtbeE6M4ZZ}0~eWr9x6CQ$& zF?rVz4bH{R*3;}|kIrfAS_UO4oOYazq~xF5n~}lBB3Y zQ0=nn-l0Ovn&qYMLNCzIb(h}Wy2;j(U_6GA&Ndv;Cm6&QsI?@i_NH&>EBB5M?ijs+ zTi#yfw!=ptodVh_dYH|#u{qK+FD(htyRAAWn`y$f8&}d3nqfxgm1u?qs2sB#AA(VD zI^b(E7Sz@rUp6pP0nO+=8_UCodB*t=&d0b9Z6ex|ZYH$RP^<3>!3n$TfVxMg#{~47 z#VL-YQ)L8hPv&0h>9(2(HSKf@=9lxnTm=h4FjqZ;%~hCa9;4oGvCImMpuYTbGG=eC z0V$SjiByeV?0)5s1%0&dUchHVfdQXN^mFc3+QL_Y$4p~w#%1EeIMjA{zvYtv?UMzTPPlOd$VIj_lWVOGQWr z?e=RMW*3baJWDj*4?#d~)mvYB@OB&A$1Q`NHU~~bI+Zw(U;A^!o*kZsCkD&uOG~qh zL!DnF1-MuYf>@#E;Voy3Ok3}LL6Em#%{`A%?)3+-tr24_L#$)${7_D7I}1{eFx5ks z@V-5r-AP~LeSIp-ixis&$~A{odT;c*54h7O=05AS4x6;ir*XaxvnlVyfy^~(`5wfy z3H(r-jKBX>uFzwe3A)Z7b4hSoI>mwcFe1F6ZTv1yS^E8^f>6vYGB@ct(sys6oIg-S z3f^c$-p$A9EpUlXMHR?=7?o`J(yHA3>Fk{Oei5;S!0hAsJa6`pL%RP=maZyOt}hGu zkr*kv{R(W_BETTR(43k)!vf&@lo-6H9P6cy_*cZ%=l+soXxR=oZr+;`G;Ycp+P%80?u!$GeEikWbMxp;+kEnCW|7t_&2smzw~{Q_2)A3s>i~pY zJYFgDb=&Z;G;0enjAVU;AA)J&(OFwsc4V@OACdwPtICX&F)jS7VnUO4PF!msT~gBD zM^auxU_Na#zJ5x0pK|pq(##mWR*58p21h3S{*gRnkNEE;16cMt`i9C{lJ#sHaOdk{ zaSSncp(4x^P~dlg>=G(pa58%<3I8Zl7L*i@Rh3WRz6vT_sDuw~UhrF=xe9~<1C$qZ z;`MNty4wsl~@X1vbRAl^9oZ)ma2wkv?S~?SI4xnJZ51ip**$G@NG)W@xl} z>3qCNo{n^DV!V&J`q8(4V`ORia3J2pd_%92Zs{B|tpH($?3n7rz!zkrQaxC_C4G<< ztb&Mg0ZSd$V@fM((T}9GRe?a2)&7X!6jxr{JH)kSo`1YS+cx~@S1CdfgVl}A)S{am^8oGSU)Dgc>0%m1f>&a`ruF8cDC#s=_SB@7 zIhfYqtOzwCI8I+L{{Z1Gg<9Mj?N~I#R;(1!EH%^^I0OB`8A8-pYLh&+;R|%_j3j=B zv%N3k--0bpX+% z=?pnYmo?F9t3z843^GaBYB5N>?PdPaC)69SPGc`U?CZ{2z)INa+(P1MW<_u;Z(SOF zAFRhSHs=9<9{s0nSZUfvEt-dzzGNo-1EZpNCE>k4X`rgI4VEJf6Zek5x>_A;|C+XdZorgD?OJgMshh;Z$fqqE}?Ftq*Wi8@Nk?* zgqY!MaL@HI7(_DmR}pN) z`k&Nm-dRo|nmyJi57F`1s!V5j{P)r!!?{?q3vr>ds1&R}KfcO$?oYew3!DM$!mUJGA3a^do~ob<6u5x-dxBaPNeMtFvT~GJ*yfWT-w$#96(7u z0yUCYnv1jSq>&Fyy#HLvcP3#pdg~sGkB18+QaTv@XOgL8^bsz(fDB}ZzzIawy~`>1 z)najO@xNP`fe~2DRr}rQqC0U$2ONAD3+Vqpel3k)?B9RdlBD^i_D6yKpljepm+REw zk3UM#{*EBlG5~QCQFP%_1(uxuj7Va>lvV^?Dbb?-m;sz2=g#`VMTYXtXApdy%l+Mu zsLWhq%b=RGU!loi{C`foOZ)uU?Nd^3*VHb5PhU3*#L?YQddkt@tJjuz(4u#?SfiFo80(N+A|F{+EU)!jyOEP_e*hU5l?Qe5d_ z|LRQ?C^D==>O;P60MJ?U1T09RrcTLo!+ybac z4;BQ}w8nIMMmfn>;=c#)m&yR-lZvQ&XD3?8=cyJs1+h~fi}`W=wYenz0`6Ck961tK zcbN0fXq4$i@4KLYwrw8RhzNRKPg$iO3}GEb&ogjc2Vv~Fno;wQfDK>m$hi)(|i2r==y=38X>b4BLJd{Lw?G=h3HAP+PgQO!f+_~&pY1pYW$S| z5hsufOooN~e(JOmentyuDI$AX$uF+0R)KiT7T77p8uLD8Dd5|F0!Cq zsmnV#W)oo(%~t?LTv-KT)Vb2{a))~cco932F6FLiCUl%_HZDUo*z)4e)6{`j*VzLT za?*Yma@Kr}88#@Q6~0lvKXw|aQ5Vcih~>P*)pQ0-)!`-(>aNJ90R;P>pg$+Y0H_Hp zT9R>Yp2>EhcF~77fZ{QEaOVHFNMVgsXYkWf8LlXAY^!X7dbP*eh*m+VR#^w880vce zE<1r({(T{!dqpgX_EYE)Nb5|4lCGetwq++&oiYETxtB5*bUf>Mx|~-pU|>=n9gz9# zj$qBC`n{R8pFw?C*DIEcqJ#n+Mp01WJ6#RLPC=C4nps)E<}OYe9((+gJ~8!BxzZV+ zO&-|0n1Aakk(C~a2*Xgs_uU@+8oTYVvJdl=$XV?v3Hz?lB~JH0h9rk`$x-~J9lk6j6^xj34@|wMBZ!Du*Q|E% zhXAz#li3oeMp$)g-JCITaDa640r^Z|^TQK$1%U%mOB?{BfPpMwPXMHpSrABXb>;2X z91DL#ai}3-i2P5V?bpLFx<7T_*_WWQo7rJUNRtN)i+?~#VEFt?Nv-4b!AP7bT4@F& zHf)_>&{_4gj0HO?=6~p!)HslW5?CT_WPGnV0Y1^vh?<7+mpGX4J4)uN5|qV(qj_0t zNGSEd3Jead(|uZ#O=TqDS-Am#EE4N4z78h9>YBC=?iy??leR=xI9f@NP7W7**1(<% zuCsozQf0RVIu|8Sjek*Xw?M|OeeALwb2TGnxqFt>v%VMLAx{^JZ>9-D_o`v(f`Z?U zeKU>iQTZO|VAd4LM?fR}UJZ~e8phw`-<#=vAO;znd%GC8_KjK9K!JK}O;Z5VGhh6! z0IS2uR-%_b0RqaQg8tNfV%gVVGEkt*fx1)uS1(O~%@cpIbgnA?<+|pvr}XZ#W%w?p zlI7~nowlVd;Eq{sDZjLCAA===lx8`g5JdFH($dl{KPujBd|n?bNFFaT^h=7O2FA4g z5pIpZAGPHp{Db3VBKs?-?LdksqO(p6y=JCK5CG-^f#K^`$D@QSxpI`Bkh00y`^v~j%v zkz+u?rg{SGF3Hc%hE)_gR^IPS0+7$+=G;UHyf*w3jHYw7HB)Ej02ai7kubM?$%4Ao zfL}7!G5&h@-H59CMu+-+0-cJasdAgK;%EG>r`Qb*4Xbt_yCWtg z7(&4QBR9_ej|{?|9C*7XY)8>=WV@3`WMs8@`_)2I5C6pc4ng&}u=7fk`kR6pGEmCH zA4!$0i*{3UHcJ~~ECC#k_kc4rHqA?^Z~&n4T8Y-8oCmr`=-Di*zo0WsIxdM(HHPfy7h7<#n??3VrMAW5DdbnIivEq_`~mBHM!jVC}JIuMXi z0{{j%07O<`6s`Is>J8u~#Q=oJf||i)yXd_>UX(WN0kgq``4RIFuychfXUGJ=Oc|pO zyg^D`#Of0aA}D{Cu(vsqV`6&80phvd|8(L>!BfQUn(0V( zl$5k|cvp^E-Xks)CF{Cv~X7gbxW|LkbGZ_m2F^(4KG9Kxd*#-l0s-~J`LLayw zrg~gotlqr6zAMta3&16%d@b&C{5v8s`|d*DUIISQC`=$f{6hYTB^_|Dw0_`uwmU;R z1neHCsI+JubQpB1*Ljc@s&txs(XiOfhd<3duelp^B@96b05sm~B>MzJtpZ>$TEQ_M zdP34g{YTa+>zk0`!RHn+uaEF=;MJ+pzhd90U!=~q8BYL{l=W!!6TeQtbs%s~+a-SA zBd_H+sy*-EFOXm6G@cy`$*EWwv5HGl=gjtXI^OV;y`E0-a7j|?{W8}5%VYP8qq*rS zaM?XQ{JA3LBf}WT;r=?-S?(s6awu~0Sgz0F%9Xgh8knyD$cUM$vpZ;W_T?9;jCCce zp`$8x+djhU__#3ux`RIhw-o3>`DCU`nC%2wmm1j`XZC+t*jti_A^oscMic*rYF?CUd>eFVX*` zNaFjF6WYulXWVrG+-JtdRIy1| zw;!e``X&F_G`Wt)xlwp*>^e|RvV*Xx$>eG&LrM5)NL<#R!pzOvlkD*1w$W?WCzzpJ z0tuu!dI<-W#pe5ciM*78grSINB+Vfv@PM{hYv6cGpPH6tGz;^pMxz%(!pM(uAQBqN zlruuY5rR+yR@3dMhcngnY?tMF!H<-KQ1iBL*7<{^(y@L*v_ygOeX*B;7bzde-ecSs z6BEW%yf4<`3vBZrizx~V2;P5*O|5*tSw~vx9y0&4W1V#Ay+Piil|-hk+3t+#l}GRQ zX6L}J0Cv1n*4B=zi4(WoWfo)dtB&BhjrJ}UTY^@K^cxQ@!I1GnOib(z_bhQ6tyV>$ z6EJ;WIfW9#xZS?K_y+Z-JhP|*tqk)&%glx(bjsTy06FbKfUrOT-%#lUtmBQOQ1c(K zN#KO>pT<(D=O@x?Yr+RX5Xww*HxLDW zW->jA>MN>;#$8*@q4;2jAE|p0lFoPq;HRe9Tu-0NBs%r2zzP8|?{E`es2p@!2EtvM z74iZvI)e>QhYk|w)8QkmYL+bPG1qhl{nlPFdJw^Vs*ijphx#@XzH{F5V(1R-PR3dC zsi|UpMNY}?YXa1|sL_HPNdf{Cnw^$fg${`@bLe%E1IMl zmQxT3`3~F32ThkWn9^+OuHR3}t29*B-$^~gtDldddo_qMb*FQ_>4`xdKAMntAG_>4 zm``(!)fHl78l37xG--vqhh zIP287IM6B-!=ZWX8>-&N#*z8RTz9i-VRYB=y5}}L`NV;WCKme@(;&;|mKTqsJi#5l z>mrMp4Iy!vnNKUPG3tsYX3m?*zSZsgoM3|eKUd~g-(R4FZCY0HS}Q*h7f-k?86N}Y!LZZV-pe4 zaE`IlOn7Q0-hpfW(2u{&>XK>$pyu;Pv4K?a2Ls<#D9;6hxe0_#pM1ZPV646_7ejXe zuldRa6BJzl`n8ntgN}z5?fP9wq#%@{4iu(2F5~Nq6C;0Im*O3$!G5?xnncJCfhi@x zqR@DSlS_8CqSW)NWWS^i+Y6CE&tCv5oeqfygFEw_tpe`M&{N?uQ!aMPaYX^)2569w z5QOv!FY1oC53KGYh0HtnsaTdwX=kcK%wulj{kx4+dI(XNFF{Q|5rSNc+Z4PgaGn5= zeAhnG@pw~0Q)G%CUaK3J%?6tr*3*={cJv~5p1jNibLycdp4-e@1HpAKKP4Yw6$ON9 zr2-v(v%>B9tG8OgxOZMLL%sYP+J;a^imefcn91%V&y5hi2W7r4SC4^F{b$yLuU`E{ zhLY%zFhz`I^jb>UN|sA=)H^PXCv%vqg!S{}qq7ei3w=@X$p~K{^?70JMymF!gl6$T zIC_rTnw3s$G+vlWl(LJm*1Y;wyD6et9;G}O9bbh?}g0py_joTK@De*Q?aB> z_aUTdkG_6We6)9$2HR(!QtOi2XkYc)Lzj29L`M7iuirs}xa(EXxfH}Kt1Pwz`LooAS2hd&gxKvY0|Tf5mI zY>n?0>>w2OgJ9bnQJ>doZ~^USLz@#~LP200BFuR|m|k7)mZ=y|F>nq~;l&+<^u?=V z;UBS~kKk;)toJcny>*<}HE@hc>(B}CPUB(*;h3+?=kDw=)|X{_%?C9|a$zOin-F$- zl;r&e=(^SrMP}Dn{8}smXQxz%1u+_uD-|xmn|4M5x6ko-@PdCn^KS_p&^x}UHH%!K zEYR2qgs0DFs_!k3_;;w=M;aDVjjBsL`Z`9!f4d%sz@##c=yJMK9P>ng5Df(fJ!~oM z3?EKP*ac>V>|@sQ<3;Q#QX_}nS=^7de)dC zX|jx{tO!5J`Y`R5z&(&$LV_5KJow6VhhQzNx(u!uSu|M3>?fvVI}(9HEDrJ$x^Egq z26k^5`q2aqao3E|LmDEmPQ2mQ!rtOuQwQI>6hR9AvVgP}yjvwDNsfjldX)sM zH9474JOWXkACP<%QL5AjBgHJ}0?qmq%>9p<_h7F5QzQA>Qv`S@X+aIa;}ShG_plI- z4AFVbz(SA>da9$`9SzqM;BG+5WZC(~s~c6|LwHmD;IW9qCtW7HaX|vO*bea;_s91q z=UpRMj!jMV>NEJ!(2|2ibhg1*+qs_DrYG;cK0kf%=8m&fa+KtWz*2b$L^vNwQ$!T5 z?)f3SKHui|kJkHld(pO18txdPW927R%@fqy?jyXmnNR3{B20eo+>ON0pBVp%V@*--FYj(Mg+xP-1aT{iTq-|1l#g>2#@tml>beb#%M`Q=6#QRPYF zRra_g{Hj}d-*@zWr^3D{^tYDoal}M~Yr9rYXk`oB8Jvh?hmR4W7b|Gc2ZnNi*ZPJ& zH&X}~DVxmTgAJwmar0Wo4|DLw*!l|4yneh!X1xzM?XGO?gIlk3EA2A)1V)P?)n&K7 zD8X3uP~>hFLqT1<6R?V>oykH+@fKZhYPTTPQs?s2^BlmbKY_4amijZ3j>lZTZgflSFwK{QBa~mK`8|W zLQ&E517A~8rbea=VxD)O6nN|ih|q^}CJF7Ht$l3(UY8$}Q$?h!5x*${1git}Sbm8> zc$+74up8|ja~*F70UNX-Ole=`B^9iiEf5GDukPwopS~r2#P`qBz$)eXePIiozxyGO zYP~uB>j2M6QvQA3rzvQ#Nf7%AOoEI2LO#EME~{U|+kDoa!y8P8FcpQ3`~MjOU?PbK zA;Hyr8x@qACbHS}1By5`WbUgl+^Ycvm{-V4j^yPou zW9+CRpaA0h!AD`Q{&5Ze^}(;nxM4Wwhbt}m4|gN) zDSxmLw)i#eFqTWpFFhL1r&5!h{r$Jti431uPb2Yg<=jGzPZu*(Kxg~D{6faxc}yKE zfUz8Rpd&5BjtbAmMWo;lTrG{sG3}A=))VDM6uc;Apt{LZSBha`3LeU(@u6^}B=!HN zT{D@@yH+8AFi}}OQn1dS@H$$F0Y2jQ3$s? z-28(^wRsW&t+{h6AM3#(&#jLx*IY|WAbwX=w2sSn4Xspf4dtp54du+>a1xA_F3LVb zeW!}JXs>8phbRr(lM%bA<3ksvmoF!4n;HjiB*gHxgkE$-LD~=m{3%_!L|!{FODCwlVWr^Y zaAmQEyZw62htNMb0VWiCPwbj^)lyD5#ct==0t%PeF;dyKak087C9&6fqL_(%lj9UW zQB)yy`L9@QZ?<0*x$f)b;NsQHAbMU%6O^u~>zG+$QRP*izBbc}QOyNS6xU30l}`|K zR;FXnnUPMwn)^{ z^Gy33zowPxlGc0}gOJ2ck&Z@-%WB5$ym^v$FB0LMY9i07hhCvgop$CenA0`1O1_nP z9`}Ak1ABf&t9{e6D!vM&ZEk%?+QK$Ef3=5f!avEmAj7G_s`?~nIU{8`1$EOgii(AAr2vgU9F$M;w~RfC2+@9 z%iYy&&eP*k?xwEPM8Wp7{x10y!Wy!Dko_yLS=OX8E-ghg2%X)`ZQsrE0QzE5T%70HOt%>(C0D|@9IMQyoO+fR8$*+0AV&gO@U!#@ZVRmtjHk!n+k z5w*4bao}~-u;*|8VvqNJ&Yy|dG^R6E)?`a++KRZovgqWs(K~bJX7vehg`Ggx)dmcc!%)(bxk97x6sks%Hty+MMeY(H7k=ZTqriFXG zO{mj#8<<2e9-CeA*k4OFl8U?tHf(sd6o4o&g(e`d^_1s5#FL6+v{M%e?VhX7IM~PK z5J3UC5LH37zg}tVz6Uzn`+Zk(?+0^;V|SAZHQJmusVAhA<~pqR!*QJ^=WRpMIVN9{ zKX7)oYeL}e6-rVuhajH&97VU8_{;YrpLqXg(XpFj%%3nCPsCYP?yqwiGGZvWRbnF3kZWUUYgjd&ClrpSlb91^NA(Cgvx=&JtVO#>L2}+Jo9z;+O+0L62|7=jFAmQJE zqBfz!nI{&Y%t_De1%gaN87O;c^IOn%9n=27tQU5!P62>91l)CfD%V=5hzH|U*j}(i zC_nK6qa9rXd_<69^boIOpbtDsdSV1Rg6G`&080r9T3~nM>)}dQZ?z1MJ=V*!E=o2i zo$K+>_oageu4$NSMFqoF@;1OTZ2uLowOe0i|2#Q=R3V{EX!T8E_mdkC%}ZM%JTX#v5iZGS;8Q6tOqn)0t?>5n&%>g(8)5yRLsK5^N_SFR+-3 zw+Y0eRqwn#z`n!0b2{$s{y4E|C@UzD)FXD3YrIg{(wQf30LjYb%GQi$;Dx&}$Pvlk zF~$o)@Sa|86ZG7S>=wmmO$sE52yJZ(D79F*U9XhTnH(B$^>CTu=#olnb6@*#SFH+Y zMbKZg>@$IXC%2PK>YhRXSfu~HsyR~~5|jy&OVX_^iPfdBf{>4x_r*2e68O?nQO^%W!9jgMCfMzEkuvFdj9bIn|CSt6IV0nuE8d~}6);~s)E z!YfGK4b7>-m&o0lL9F0(RMM&&7yR8HuHV*dzJuEDnEE%3pU8>C`HkwozA9XqFv-gz zV7V`qj_=mZ^{mN(!8=|EN_u_=5|(vOh?KSPGD0E!ZtHEkLi54YP)Z`H+o#jx#UEYE z%-c14wRmn)s}AP1`igWeU0yB*`d~8(kk&ZS2=zrH;DPEATt%Dd+xMTA&$1*52~l!d zN21YR6`+*j7;s1GN%KJ6s(x)`}dNWggtx*isLzt zFAS61y~F2^0yjrin1>?+k!Tx){y}Qx*k-L^#pq3B;uuG9re&Y^iy== z^`++26d{rf(w;CmJjDGZ)){Ok)@`VlEmKbD#z(CvLD=X#RbKi6-;G`6QAw?{{kfRP zH9}Q`x`O-mp46w57wwRXlNA$JRDpa%=b81f_U3>?)}+<>Vz4(@KjVZ(p`dU-A<|Re zsq(7~fU5gisXLdkrMpejztV98C!mH0P$d+dE9(u(?sHaTE)8R?922VF2;A^!n|t88 zQ6UqWp^Ul=J()8JuB=dhc1bQ&mn@@oiR=Z*J;q?(K_jQ$o9n^p#)_n?Ek0G-H5yb0dD!B-{7;hYqI^bP=VQ9>eCA}_5xK`t8uhfb zLGh*M{_%Z?w))IlNE?r7|GNE>sHtWBJ95%vW}=ID%N6#)y!bJT`yb=lxRBBfPl+zE z9>&FdA0tnqTVKQI{u4h7Zsb#;Pkv(39bUZpECidGwXQ_KBt7;#t=rv5G{t)Vdw&EA zbXI3|rC_<6+dm#q%8CN8&GkUd2NTLsH#V!Ix9ay6bWiPP$Cr*WwZYi8CsVDpKgfXv z(J;0t?B>IdyVs&8Nc zgv+`LS(DLoT+Ri4%-hoQkMWU>N@P6FB}G>6|51Ab!uSH2?GYulw?{c@7%FLkn8j0> zHD+uXdg=c0!|L5R+HxLA99)GkRO279>t%La7+W~bbV&+hSKm=iWK8nftg5<){Q9-` zRl;~Nx zai0Aid|zgdlU{1poXSYEXSbK66U21%QH!IzX5GH8d}Lic`OOrV=-FwqRtf0+TRISw z*`8ISYfn~Cdq>?8o4cZ8S}KgXlgseN!B8>uBUn)> zMEZSAR)FD2dL~DTS>u#jO6LLHB0F6R3wD6ZRE^+Uoz8-7!_c!n&`THpQR*Azao{id ziy7r;o5uL`E#~Ex5N6G~Zd!6gpbnQrcjYxt+92#?W<72{GL$IbSgVe-Lsq7{Pm~^u|~u`|8(=H5EL%O%}pbGaiv|Q`rU=A^XBP` zF@GrY9-fgB6Uy&R4_lpf%~3*bobR&^FLRsixn(Zvpk6>0nLZ;8tSG91qgjrs4Nj#x zqry02>MV_2R+6a1>K3UrI{}_<7=DJ>tF<)NT6+&_UZ1yg_nrk#I=+WF7T@^a|sv439c2d|No(nx?)~ZhqL3v#e z=lXs`n5PC39a!v*LD-R5MuM~F1bTW&@8)9C-h6o<#;t4m>$dp^k8S) zqrlW@`hVZ{Y$&5H?$M?;z45WBNxE>c<}}UcpvR963}=}Kr~j|L9QnGc6Yp~3Y)(JD zHgUys972UDih%#>1%*-c!!mX#)D(`>2 zYyZ{7vbzx`5Q6T7gEMRM{p*+shYX-!R%-sML1&Uk4RIs8))NuQNsAuP{_$Qpl$Ws1 zWc%NpiQP6wn2!vS9?ybx!s`DK&2$_foZuP6^*I41;JREN6i|HjD1M)kJls$`Wg{* zZck`#_^%Faq?)K}T|lE6h(fb4Ia+Ww{V246F^y%+XD$@HSwS!@3WxQQ7XJ(DS?HEH zkfbGC3aK!hqf}GMvDz%a=JM-9S?J=GIu+5ja!bDsplm(c)bMOMUNR5%z8}EmbJ+WC zTx#?A^heGY)#hBucml?sAE6XexnNz7M=tX5u2xq?rrG`s=$gNMOJ;p+B$l}?o-0Ux z6eLqVq7k^Ap`5QOS-?6kpM?G{5NhB+mi3Wk|NKw&cFwTZu)$yn3G`^;2Q@cd?&E~{U0BSLLCS|!YbaYeANurX0orC8#E~K z{vRJFfh22X$HtC)uk^om{!o-3kMn;$^vLjcNK5W}$DrlGmJqK}=@FVBFw~262Qvom zfT4F;6=re#-&b0$Cilp8$tCC&@t1&eqZck;u3__|rbWO8+|Ph@qq*3rDxteQIkI3S z^meRoZ=myF=9iwLiSKGygZ4ZmghA3=ruAKR)B^BFv0W6p# zb|0-a-pB3e8#`>`BY{nP%=VK0%{s&_)srK#D!RFavFB1)m}gCXWG)9>%tsy3Y6O{M z!)GgRz^M6y#uN3%7maY4BTVq7BI%j3vhthB)t>m1X9Wp74l-_s9h4V~L}|dH&Y;;J z^TMFcc{R=?k{WYstl&(F*W;=}OF+Nze%OlMLZc6h6@WetkANEtU4HF_xfQ)arhFnz zwt9gzF(V(JRtdnQwkL`ubw^TP6&t4lo5G95ufR34rs}N~tjHV_*13*fy~D5(Tsz%r zJ{F!4KOQ`2z39jY>^#Yo$A6Q~{R;Nl(E!uC3$QUd|DYQ?6qv@OQ2{!uma7dZ9pu`d z0gBsTy+7s9vj$P{6Hv8IF)=Y=AejN+A{Xnu;0#)W@dx+k9LBUSHXj357GR5p1)4H~ zYk~}V`ld)F=(Nq^u=ht9!P8^K&Y<#soDD`6rVV4yk-lX)F`UuQz>8!MN1sr1$XflN`ir673y43IchQ)D4R z1XoIF5*==XgqY0C%yxpB<<=(}`TNHEMGUnH%F3T%Ol#@1<~X{*lqc3Qmd8>Ys|f)D zQRWq(oeco%W&qOtqldtJu48E;#z+GEucbKv>!jt?NAO5QYPG*Hg8o!-AJ(6c>H@$Z zU-akO3+g}(4Gl9I_CQJ_3bYouTn2z16GSE+*p49F@DaY{1!K*#5E|KYAjFfoHsrkg zPId#F#5!(O+mX8|Wvza`-^so91RkMwf_EtJB?{kd45mti5ZKKy)?E$0K5Hek!6xQ? zj}4E4{b}ufgv~EvF%>Wpi-7AbE-Qk=3fmd5_vm3CYqxK6Y;bI>rBTnPI}36LOtR`6 z-&cT6C34#6FdKKrq7sq=!4>^z2Ea&h3S4S{l7@M8z7hBUaMwWVnW_m+TwonlebuYC zF_^K~Nn3yocqtsfMSpfa1DNgWtwl2t;QF{?<8R->z7K;Om-OevgC64moVr*NLeFoy zFRIdB{@?^2yvH$m*sETOumqw9l+cbMvD@W%CN-gx)}}MG!H-$Y|KZIounvXo@vRTd zuQ`3)RdRbWrke^ZMGauyeZ_!(YsK-^#a{#z4<c z1w6(#lfXn%9GIB4Byig=o8YOFt3{h>cF+3DHo1nQN6CuY4U=k!~M0-#t@O_JT!tO1d2uqSxA&AKEV z5IQsOc|0JQYS0pB0DL|VSy4TKubi0QL*iwVZS?buL*O|^wDohZ7As33`7sQ9By!a6 zvtjetkH3fKwoU3Bl0^s&I#N}*gQ4aXT8;3-iaIpBaWR5y6X?@7A2@2YE4JJ664!D(PSS<)v9~ zVSx=z4vII-A?@2k)GsshHWNR@RmXj^RdXoHN6>D-`$b0h;7$q`Y}a(Qopo*B1u;ez zkjyx)-}-9%cFz5*<5qwFWDg!vAoB4Go7rM`CghblfE$;KEO)0X53iUxp5!11VgT}0 z$n6R|_4xNSvT>=DF93qGggfhb>wbQG#~E?K@#k8SYwigcS}lBFMFuh)t@Z8UL9^J9k7VaK>LxWtkWCJ znlm$FmMK6Z^5xojO#8W@0^wlOAF+gysuvSf$Y+zP2^kW0Hy28Ju(Xi^l;e;mqs08J zOymBJdow@Fx8WrL0G5M9DBM?zf~zzQ{Jv6ufu_K+4UvQbD}~cyEI+Dd1m@l=Sw(!T zgH9?iZHOPtR*Ti`ZcT872vSA@&ie=iv`o9Rg+L?i`7j}28iK;xID`u*x3AjEf&@(M zx93M|T`Q*R^QdFRIV#KhBSi^1VJ_*w}?{McrgaXi(2#1 zG`S(vt9dp!!P`TW!=nTQhvhZzS%vmKkb}gVs*?US}>Uok%>x~k(Mfy6XD zC43G2SLc;e#RCEMv==PEp0FZ(0Y@shN{q?6^q}a8jSkrSWnayNz>O3~3WDAg^a(v3 zlzYUmK2;;)-$$DJ;4B%&kIGR0GDL%P;Ynw8_FD|pANQG3#}VKJnCUa`tp%@uGK$ZP zT_WRsR)9c|yN-=(09&f=OqLFzjR;Q*x*TAp30RIZneo{2v2h7~MH4WNA3$8nE&HzU@GQ6kH zF^-6!AZl(dHv_axrf;XCnl?DFaw1U95;=#d;<5Vdk82)%46a!V0=s5>#*%OZ5964} z2Ceq!uWiqY>K;ZOoJ*^}eB&P)cGxyg?n|&m9xRH;I{pN0Ro?5ok6G_=CbqFwPgQDt zy=tsr_j@=dJnT!28kf@2hq86$pU}J`^%Uuo9z9{0BN6$yPiWMNL+W`|G*WQ~W=h>Q zxHhLgFMU_U`|Ymj+po51LZbJkGC2|X2{5GE>NSW_)x;-z6M0um*u3{&;kxYNFdaB8 z9yC5+Y(Hw4S9QA$hOWljE@bW`P zd7m|elG3zrqT#Ihrbr-Z@*5=+H!@nS>p$0`wsW4QCzG_#RSW2b_SIXLa04H~P@rEC znX1}KSOaAPS`vjG9YphwY2fcA@r|j3c}C)A#-3$LT>58=6em$#Yiv1;kC*!xGtsw= zEtfdpX$oNV!r(p`d2{I#!5fR((f+#sJ!zw`$;1gvCc1Kor*R4Flr9T+785{Sjj*X2 z8FNUx7pEQvL}4nX)jMMi-EpiZ&(!}Nwot2t@!fvSUC#gZO581mty2o!nYJ6xS{rK{G44&bE735`NW(W-oGu+ zvt)hzvgBBiN2+VTgp(loNP26wQK^$=_m_{zjk`%oPi9EhPkw{Dt_DB@sU)5{|keTZ@{XyHzi$4?Af`*Mj80GIYg~#|!2)dEm4rWSxTQ zF*8+w1$KE3&*UvPSlD5PRyt4XVIJJ?o#PaK|pB{LAqPI zq@+Pzbc%qWA}QSn(u2=j+2h2*G0$Tm^K zE<0HLTuZPJNupj-?(UV|VFkU|g447Hp6mC|#1(`_TXc;D?I!O%ucxY2CvkYDj&uuv z)d|qiJ+Ymwo%mr+-WEj@@1Uw*`Z7cl>IX8(GZ}r`-EHi0%U9Dtjrzc^HudD*cjvx^M`tk7>pud*h=5%LPA>5_5MC-O5*HD82)uo zNXvW=&?y}<{fe|G;Ip6BLf{TXm^K{!4v;Me(s-~Sb=M6*DmM^431|Ki1dOco3=A1h zPFhd4r-l@^(xW`_qOnj`Q~OOrz1-z=)vnHraL zd9?$0kRMjPfuelZ)#63E@{N8=puYT7&$o@rt-aoEvMhRiTs#tttM)z!^|dbgur2py zSeJBlT#>ONJ86K21|jb%EBE$y&6e)09TI1UY8e0^;|qtG!NUmeI+M@PW-3ydY_&Z< z5G=YJXyJpaW-3HOJV3fL%G3PgrylDTHyn(}=WN4zSX>~r_0ipQL25cKke1;gv+g)V z2V-)(*;=`{>qcyNh^7+6NTSANeZhEdb0m)u83S)J-&klf){%`MH`^vK_XIiTs0H8M z$qEMtpVktil=`a^-Lnb%y|>h9rh;(5r!6OEP4T-%3~qTzl3jv~vTGLt!6v`gQ@Oz% z)Y#@WS?5);WhM(yJQEa8k>PU${c%~7{hkPr)J007Z=&BiAXQ=m#f%5yi~5i7gBHAu8doqm93^o7T) zv%U6a=Y0AA%3@R&jwLD6uhRxqf;U-^ITS$@&&EN8OXkdB=Zm8tR5!ALKJOB%=-U>WVp1>pxi zX*JS*^FEzg)KhC6;4Z@NBYC8JD_m_A(@tTRinCw?qOQk%RR}WG*lSbqlWj-JE!9b74-B%KI!WOCwwv_zjzf>8lwr@Fab=nMAOGCo^8fU_=|S3f^cM{&vya< zG~u-y(QR*(N#*%5u_RvMIY;e6px0n!2X3$zwEF%-MifA||IBCxSO#dXqhyjoNNN3B zzx)bJ4Jrv@Zspb})U(PsGub1uoKGn^1T^aQtM%2}v5jp)C?f7R(-eST5@4>GXZ#YB znXbdZ5+Ub#Oy->^cmOe0TYsBHW_dKO{loUtv3~$$PEXN{ZRv};DRPIrBV(NODUBo& z|FAIUb75*SXoe=rI8U~5wrlzL{A>A}0*<9}+ar1L;ciNbiYIbIAiV8(iyn~L`bP1N zNktln-qfOSHPNjn;tQ{K?z* z_qI^28mOUwyFC_$Bu?v)q>2g|V$nn3c6-fIZ}C_aaC6Bq+X?vcoq+2NC?>y7g(0J} zyU0e$`uDZ=CuPqaKJ1^WX5k2o;u4?5Xd*h%Av7`+b^BbgfWnH3uSf4vaKDSsI0!r%V-V5x)W3UKk`&)#m^jOH`{ zH0mdaAkoV}ghq2r`m0Q?Zj8LGsL0#u0!>!=mXhG?CK}x@GBPjgXOd6Dh72^~ z2(VzDWW%6NdUUUdv${#`GsscnZb@w$A>0$(NDcVNjl#L*hQ;t)fnCSo*fAZ14ZY2TU%QfiM27sRe(02-2t;i)D#kGUTkBw8|H+Al9oq;6HU$O1N~Lb79PEx za(ED~^fv1C1obwt2PnUU$6KdraGdV`zNO|xe8}oEO*&Vf0ul@L5PLH;Rzg|EJrZkx z*-)q3dgigh^)4noXk`Cu_g3VBH@n3)28G}dTH*oo@7-fJiob&Qe}cp$vw{Ssg$E}@ zyawJ{-F2y%a_Fo^2|i3YI%W^?;-R+R0an`Wt|gR)goJT*SXL{l0>lc)GA_p1zfb5? zb0DlW6#mO0_)}B`-qR6Q3K?J^We%$FJGWCs+`9_Io@dq@g`*U{edxTh_OuZCyZq@f zOpa^|M}Z8E!GF8iXGr$m$rkMjA58gPJhLJ+8GB$p#^}F4Y9S3UA1z03XPWbIdInb- z`jY3kj^vr6@lAG~j|Gp5i~ZegLjWhL+kao|Kbgw!XJPHbkhwrU>s>=1wPv3wB?WD< z#_=GEWhH|%fbIAnEtkXYZxokHPWM|9S1N-2XZ5AFIB!K9L!}W*arFSY(g^vlYz*9+ zj`Ws%fr#d#8+{=Om8qbu^gELJ-#QYpZp-Ed|LavgIGXRvb1Ux)z4#?*YH0sK+++EF z?fdY?fF6yyXI^b2DrEBR#u$n5^d?@b#H%%dv)Z&!*4g^0gWfz$(ifk9H`*r@;;vN1kv z*u|e&+bJ+_eXF_}Kb0pRm-bG+^2hO}XodgcW>#qu+th_*vT+T8Caed_*<*fE% zI9M&dIZ}Ji@%@W1wEZpH2*1!uZ|WSl5difPp@cs{1iS>RNQ=sd|Ixvgcc8xPZuS61 zlhq0T_2$E6{chG!r;7i(_Oj}13jkNR{okPl{{KV&e_*UP#xfSA-`+e^zt7mj@Uvy< zTXd@@Xw)lJYC87iUTDt##5b-D^63xo{(Jf*@~G!aDR)uosa6C6Qdq3Bt7g}fbewe@ zXTrs+fs~elR$-9ezc>L*GGwz&@l&qf7g#J+w)W(P8&qE#*W*PW(Yso0bdJXU98MN}SIRDAcV%|OXX}N6*5KU^u zn12A607*_-S^iI06;`?TyM*&qT71DDfW@B3#ZscRw z%XsW$oGKVHHNgDuG#yqr zRK`Uwk#9HNua%^3w=u4qI9b*)h@^=9^1of;&`nx$Flnt5h{Y0ojj#^i-CM zN7lTzOe~$waem4n;`nAN;9SUkFDraXz|g}EBwPzYW3d8uo{P8I8&YTJbANQsISR?Q zG8r|KS+`uj925v*B6b(hzsC zXnr)mfR~yL)^c_y&zWA-rOSk<`dzaVlW=l~C(o8Yn?2P{@@N$rJsZkuX??wo5Iu43 zLkAS7aS;=Lt+BEkE$7XG!Rh3Q3MI-<{Lk1zyAib3NSHagVt5m!QyS2_8T++9WY$-q z^P^G<-O>8cyu_;=&(z1S>=>v-Nf(H%erv>q49tsYg1!-JAxtMx+)Lwfrr=BTk8dtk zj^@|CRzeYFJZ3yTr9X}apGBC-hhdNq|7Jw$uIEIA1{fQ+3e27`i`=ROseOH!#2wQr z4$@0NkfZ1t;Pu-X=Pudq5nNtXciohS!nTjc3S9f$Wfv{&(*3en0~*E4&7SEjrpOaT z`Ah=r_tr1o&U;tmNUq*0Fpw3Q_3D#51Rmc0cy|*?10F6(LHz&~ZXzawa}M6lV4}L( zdvJ~9Nm2`Vo##%Ln{iTmj=rwFhH3d6_L$i8B`qW~tG-;03MJe8r3@cfdU+O$(iI=e zVn%iuu?-~a0oMY@`?_u)%|{#bb5~_)LVoV=o9wnnn;b_7dL|iK2h{mB-6mUj({?mE z6gV7+5t&YX_>*vZYtHcU@M3qAd;7u>kgmVxJtn=>r}1O!$*xnj8~ZG>gV)7T(|wL5 z>E$#dJR#|W=q7WVui4E{vLV&*Ke62zlui_Ne^KZRXewvJ3U>ARD)${NH`G2s@Fw3~ zXlBp=E}X63CcBlraiL@1JSCu(n%F^QP!%4I&JARN*}W93N27!&#=^#pWmr{Ae7~7I z3)8o?9BPHz%L6$IG$lN@t6jg*RVts0&Z~u*v_?7+8nLrx=Pzo28A30#`g17L!+!%L zY$pNEL&<1N+vI$5iIdg*Gjk8}{^LP8B1{v+kE+FIDXx^mH>EntOjRk@{G8p5d$|X_ zq=|b(9lx6_q-^WEfPN;0B5Ji3p{HC|%{Q9OHz4?%0x2^aRifACVJsk+u=D%Ri$L)6*Dc@v%BP7(`)b(dq~7d)gE}%$&F% zleiFgp~FDa`Igk*PkH6oiTBDwvz;Gf1~E{DxL3GF4?RlhTd#ty2Vyv}^Vn@6nKb(=YsYN>T#?`r7ir*Rb)dX)aUbRe~*y6Q$~_6b1Hsrp{w3S=kVe> zL6iBG3~7Yv8!mFQy=}+U%^jF72lS~-V!i!0I;>aHN@Alyjj{}LzG6(6FD-R9_LUIF z@!kh_G}Q7Q41sD&AO-DFi!WB}4O#PUE|~a|-Fx; zVJe3W_IvyHZkc_8)!ij$re$43;{p1`bpC&hM9(|}wXgG7;b^*N4PGBW^Nxg7{Q4DS z7S8F}V*GKNyOaxlp3?9=2*=EF@A%1&Y?ru3SuAd>`k8swoC#^J3tZ3R62mvn4W|Rx zVk{ScZOsxDS;5%qJwjO^n&;kr-i9fmO^Cg=)!*e0LDqp;3+N4b+%XG9nVw!;eltanHxUweD*A70nGw++=*2^c~|wUe8VLo(vts zLSFAmN=8#db1H@)Y_V^UTj|P0SYn-zT-4gRzGBh^qs2EpHdDC!CJSTjsQ+~LH?jAx zRRFkCy_*OmYn$#*$hcWIpFlzWi_??z28+Z|A&qWjzo~~ggeLB6ol2znAXh!(Mql^5 z>3jjD+80l1c(`-Z<#>>Z>9R!vX7}xP%kc142+PWG&lfLJ@1t&G@$AZFM-^5Vf2tg6 z;c%h`)wf%%QxITd)T>U5C02%*^~zulERl*5H7}VsJX>`tX8o84cPCW=wcTS{4r**$ z|Fvu{UN8)OXsSPbYR)jd>;2vb&K}i zuv&Y1NAZhr3^F_gWj5gDSHc?X+;$w%FX-Qz6PR}wQkCu)F_Fe~XX%GQ-OHk?Y1s|Y z)XZe{%|+jzJjY~8#YPRdEiD~B4c?|33UQGaIQy3;rY57zeAy*pvq5jQ&GtxEw31Xe zP7?Srl_-W)#inJ0W7-5zA1EohL8#CoP_CHu&ELH}J=sBN_qBgbSBvJ6nv4mNW7uK?+FiVrFa8yP+~>ToeY0_3;i|L0&p^P zDj{~b#RQh2kRN`Xxr5CydVy|VxH7t9&V z0JGm(>A%A~aE{2odWnG18PRVOu2&|6D=&)4F1H=(*$qJ&1#q-WwKM0{)uW(+k{CrX zxSD?Uu%l2*-fiTXN4cGslotkMJOU&SLfZ$_lMY&@C_~t|r$EKfp zYfYHni+g)<#oj}%r%Ox^*sISyU;pa)xDkYzVs3M?o}~{~7v3o643?d*d5L^*_qp4%Yz3TsavOJC!_v|oeME{S{G88Xd~{V> zUy0+;-hv=r>2t`(q_cAjx!dZb&uTOB;gsKiXEpkxpu;3WwNFG88kOYtZL{k;{jt-d z<4i;SbIenu6vCN+fluAmZ*!7&1nFEs5{>hc$ZTif^2}v5VeSaNxNr%~&X=`q08$W{ zXCR7-EHphqD0JN;xGQS6jL76MGU#$(&3-9{)Qt zov6{Ay%--upcAk`aaD0%AFVT7QyW&R}GA*G$nRvO&pXb7;4R(<2hP|t3d#2N!k#*h2V2P_y~rn zlNpLD8Rq!FhC+alwWvylqpN#g>p;YrYIa}p5D6LwV!#(;0^NA zjzy|ZqPo-sfoDLE#^f5l?{uZ^x*?}pzr3Ecyn6rsv(5{W2Zq;T@?3i$+AS7a6bp#f zpcBcn9wW9m+2+MUM1SHnqt357)&N?4#bSxPqpBCpr;>}**Sqe!b!Dvs0WuvMp2sic z=2~T`qmzyIqAjKXB<-vgWQI24?XsUp3}8X^o!#^4zK=yy$Ec7^1qD#kQ@X62w@H72 zD+9gmqw&*%8Vq212#FbJbgpHLjcH0`U^+*QU!*&)%0ttN(m`MeL51$bzDT`w)J>-* zJJf>;g-1;NTE0jG*#%12YcIZ9 zwD2}2+-S7i+lJcWO3QZ1uCKiv8dIO_|4}fF^L?%X1*|L*fvEX)n;ed|`zG3sn^TXu zD7lUjGEgV?5f=x_;+{)Kt}~#+v&xc%g{W?gC@|CJhi`rntPnF@VJSvdO;*9i=-FA| zjD{IAwLq=b^%@va1KAX&>$U`P|Jreiv8*5VaZ@Q-dUoml&L}6vB;5<&LB%ENSl|&n z#dvRjM!gz{({L8Go}@T3B;U>a?Da?)CHSl&dGXWL1-mLP)Zx1KHC^ml@|-X4M+@0c zWZuKL$^q#(o32O|FH{0`E~iaR0LuyRn#j)yk;oT~txODCLtexz`56`vG zO#(u)KLV>}=4n0)tDiXi^xr%XNFd@hndrMSDejoYy+#t@Ju_dFt^89Sz%P@CQ@WqH z<4XABeS|K%d!piPWF;qmE+qj$A<>mys?TC8+3H7qGxI%VaYA;3mr(}5LKe-DD*%H; zkALX`H$vhoL_A6eW)d)d*g`_x)&Cv!$D`}o64(bpC@vj@Bx-~Ai}}!V#sxkEYdKqCW#Y5>Z?4XM61M^1zYt9g6X zBOx0hzh6UwTR8GW?ZoVxX8}9hyH$bIy)5z*=iZo@bC5o$1SsqW!F8Jvn^yKj*f8ff z3JA?DLnZrZrbr;UQ^MwMcv}pdg0ZF2?&STpR@(6UfJvfiWuigV{L8^|NM`o1Qj=RX z@~UP$U~%H-s?oVpzBYr`Yzc0H16J|r$b7o-+i|~(Gsy$=wokk!r~Ro;Fu#Z9jOF-m z4(_b4lV1+kHqu_z@6^vJbOrM8!n0}Q=c*u(E+qYj8xwQfe z+#ba_d#+KEW3J7N#an28*PXT36M5~rJEk7Xp}S!5*P(3&j{~6^~~;Q*nCgzW-1rdl9S8YhZ1mCm73%6bR6IKX*a#nOTW05dziQrV~)SyKpkb zZ)XicrA(j8ci?d9tD`z?%1VE0>!}vF=a`t|`sQY0Z(6BA2s8W{A917q4BrfNid>m} z{dK=aIaWnJl?f?5R%yS*qF*@7{f{mkVSBRp^}1nvq#(v?E4&>ePO)YEkeMm+G0bJQ z7T?nwUi&#NaTB89o~e)DURwOC&K^WSMpSjy1)9njZcGn6V&$sOozhKGBy_*;)=NeX z^%s$JddP20G{Uc#tal2^n`4OxA~st%hA{yKaw21l42kl5EcFteTnDLlG2onz14VeE z?^6QAm^=yAkr>z+gv?BL4gI<+Wtc2j0dF??u0{h~RPwog9UG=}4{Z0xu#-=s|(gW;-dyd}#(MW zcU9>IfvAG$O(32^z0;S$mwwQQI~#@*eCfv(b*V=H0ULqk4Me4-R&WM`QM ze_<$G!DIC)rxDln3TAS1K{;ZXa9O}OkgB$uV>ni4l&nkQ2{(E&#n@vFHJFMvR&hbP z=5GM{6Bt15;`s3%K1+X-g&5&{)*80g{>tz7 zB)7r5fm?3W!_(&OCA zacEhqOik0vcox1)C|xD#VY$ak)Oih5BpH=y)`*bDj=xJr#QlAYapvKdNRo(fGB%C^ zm02y&WfX}Gf{3dgpD*241`D>d=gea;7&DJNpU;pdK5q+yT`rm`pMj2+6QDe&KH`Ia z%Iz&ER}2gYa2;laAkQPyNXva(8Fth;yjqV%7_V7m%LY2(RGpI0XYgDC4#aYe?m@;hhFNceGkGGRr8ORmJVb&4x_>D` zu%xK6TG7?iL(%dNrf;^3|K$_0bq`tPb!MO%T5WxCbZhllEQf${5hj>6w@KoSJG*yR zqIg)|y%>=N^wH~S_wtfQjKgQkCxhWH00i;dDwT8wqaN>h>5I5Jg47<>(=aeKd%vyp zG3I#2r%~n`in#0TV#hdi!&o@>geq@yH8cTdD$+8-NsIIAm^u9$o7eO8_A zW?=_LA{ZiW`k&v^YQ?Ka?Ysha*8jGD6TUKj4ZdHYCSGdhY;qloy}e61xc25wK~o@Q z;`jIIkx5QNK#lM1DuQSE84bd2LQv*DzPzW69PV$u`{jpp8Za_RsL~n3??F<(u+aS; zPFMI2h{sZ<7^S{*jH-k0S=D_&yI;qhT~qn}j2$^B*uJg(^Qs86v2YlhS^)YjaAJZ_ zfE;Y^3$k1GvWir;YJ{n2n&O@Y(<2A3|Gt!ePAXL5AvG~aR?AB=NxfU(+#X87z(uQ9 zFAOr&b5J>`k*20l(}=O|*t=$d>j8Ge+@Q)J-o&5IiOHXO39CIf%W9%;GSU9~H*r*o zUo_5rel$*dZ?$!IwkXeXG&A~td~xRR)uMAo=g08A81oO{bjKbePgmt$uF=w{CaRd9 zL6Oiq{h3)GiRmeX{ePda{KmuLFHAbM?bNZ0;CT({N&)slt5pIcOEeb;QBe^X_Y#h` zrlP_=Qhg@ta+ctO|1qNYotqNU7+*1Paj*LVoDw0)l^7s1b9!4`T+Fe|V&JqD6k8M# zdQWSlTvK)+SJfH(Hd8<38cCl8>h08%^?voR&q#w1A#ZpwZ2acM*?Ly8l!8L|rC(bl zmD3q;D_IhA#srg#=J(`!_d?N7C*yLRhe+P_JJBK^yji_}p5VhXLJk#osff)YVMN!5 zeo~jIL=w6~1snV`*3A4w>^jrWlrxy@ZR~s0 zymC@?Jk>=rbL;ag*I%NUB-tdT@e_O+^{Z!{nVP*w{O0svwfVv;>D2K=$4i*8tzx$P zq+)da1ow-RTl>gwBASj7U(VB*2M^K>fvP;W%_!R$0M`wrX%-oH&h&f{lm=iw=~u6c zvo#7ngM{xCd6D@SbJqyQzWC3db9Z+Y!4#79-wiKktHIo_lJ9AY`6=(uF$#4Pwg=Kx zsF=iTT?4dS>R1vZ%N`&7#R^-Xo*|44`;z?tF6EofSPRdff|5b#zbm z*Z`D+>=X`{qexhPL)40QaW3v9M`&k5u zZRZS~8|h`KS=VQoJ^bNku}pM;9Gkg&3R)Gx&a1s0W6mqx(Se4GfI9l|)a795X;p-u z+txUp;n2J)jMm#C-)urVxU0l3FPdDZcF(if#&l0(+Hx=1Yv-YIvC$LdOa(Ii)YMdZ z%|bmB;2`9b4CK>;0dcic=w$}r4j+(`9vA3Vbk^&Fv9VJSPP&(d1H~l={7s^_Z{Mcr zRXL@B|4#$>aHkhHr)u1G1t{&z=U_eudca49cg79SV9mB30Sx*vUJak!q*}gig(7UW z#;xcPV^X-7LRwndQQr(T$XQC=fl*L?ivcQlyp!&`JubdpsV~p|Qnu0%FMzG^dXUv; zDio(qcPh39I)1;w4-=EZfs z)`ZZko?RaHEfd_}b1C$PBs)pLd$qbs^YEgQJzxep(Ix?Iw2BnaljO>g`3k$KQMHdB zKR!4=*=7YeRR#`27+VE}fGra}J$>_65x1i&jeIS-h-<)rp{?MDM)fuV=H&6!n~`~k zIhcMDP5KElGxNFcSfM@vk|3}`l@oB%k0l1-37=D1wbVS?c&As}>P)*Uq!d z1&|s|mjDSa1kg-2%ydF~0jgdq1IWs)5s`!bdR7-c-?S=#j4qq5cB{DT$&L5*DLuN_ z`kn?bvrRx9GMt=WzCQ9hSdSyo=E!WcU#oZZJ`b?oi2~yK(DEBm3tBPlW zEe2!+!m;HtZg9fTG$9An>@+;;A!0~nlMr2DBx0B7JPX5 za_@#r1^zXMUL~zKr`=9;Yf}Jb3$R{lN#HhJdI(VY_rT~~12#qH1sJ$GcknzsSEB4^ zetfQiBy`7wU0<$>XU3*?#4vWLWq=i+V{zcAFj&qE9B{u)x$MsBClTD@lL0rN4a|XN zSsI^vy9B!=N}^m97&%hze()O7C!qu?xo`G;MP6?)i2=U!Py>z_6lv0wE{l8S=4ysC z;-te3N_ms=hM>T}F8VW-?7`tmU)~*J`~q^oS7MFrdlk{Y5U^ZdXzgvhhY1|EdW468 zE;;_)yY#@M#LU_@{2_dlkcyA6l2IZ14jFJ*`1lL|3`x^B9gs7<-8(KM0yeAV;JvRU zAI${m5I=ki0+22lBOf z&&aJJfKA5OrlZ|A22fKNuzK4lE2)DHyKSILag_yEFmWpRxvWGdZb`;bH3Ljgg?*3rI#KC;OgqpO&k<<}d5(;JZh zm4+lGRlsmJvu8O_CJ<%88j z=Vb+rxlLwu^xSi%(l#wU%X+a_7eX9-PMAp@WEG3jZ0yWCkb zUV6O@A2dqYteXl04GoV$Cj7lCn9OAowWBhhx4DfPP>{ZDHYHNL@LkwNVeKja<#qHy zsBU_@m1W_8Wtl&N-((J$A$7&3gsZspmkczZiHf1ViS}u}s;m^MquwQeQ02&@Kxi12 z$rCW&I9oF-Xc(O{rV8h`8I>D~92t3kGr-0s=hHlh zBuZ_;8sNK0f6`C#l&y0b`P?D?lm!SOf}U?&m4S;c=wVZI;zUE|Fzr_Y<~({KJOp zs10eA({gD3_s@RKR_TouncQU9Vj74ivd=dAX7C!|onW>QY~fDmxI9cONU_1HiH z?8mKy7+!4X=?bHe4w8{tAJhG!=ow@jqy(fiKr_ov|FfeVx=?zftZA6CPy{MJJfS;K z#<IP`=jhN z2*5ipMkZ0-2!E5d>S7%NPgF3Ticqos4Zr%D8C2H~G4=ToF`PM1zsM}~C>Q-5<#3I+ z^5;1E-nQY>Kt8PZc}sAq=@>MRo?weTHee|XlATVF3T(Z}U{R>UnEbEE##=o&D>x7F z_D)Qvn8A-PbX#IfTu+hrd>VL9soYS;0eyat<@P6Sw^j0Q2f9PuAwwZ$F<|Mp5L)MD zh0u&iZb{WP`;b1tz*Mn*#p!3ZS7ME4!jJ^9EO7)v9!_tho^5u7HlmW2HBtgehg+{W z$Hl?dPT!pdc9gq!ro5%0`gs#0p+2MF2&(S;ei6wzt$GesYEPHWX-cJ&HQms7rhU0+ z`z5eb(|>vrVd^yjV!A>BxFw}=e}sgjxqeeOTv8L|q9NZ@TWHCAL8yR^rw~x^FkEF` zL>b03%OTw6d@OREi9rH`|4YP_{ZN`NvUND`qelMA0p2IZPt4f&PoC^sL2=U~FWrX8 zG;jWVsmt~YEj>aQE&aTbF(2`6#Apkf?`6yUt)-RpAtQaw>i`mi_p6Qa55d*mI9N7b zaEu~#Z(I}g71dRa@< z7{ZimSSc~amv1>N8c@PqrRg=_UbNKNPyOgIf8YKUtkw%Hgv{iFd|X2F8bd!Kn*IDf znGU&od?)AglCR}Gdk)eoP$&Qvo`0oA-J1UHA(domOndgj1o)iV5_t?SZJ@gxiitvW znhA~V#~;~s^GBK^kO#iH{gI6Zc8|7QoGeD;cdo*HC^K^9WmiR+1lFju#y-k_?#gRA zF?vNv;O%c3p@qee{UJ(4DxfEI<9%CHA9g*{vqzXgLwMBWOr(i zB72AP;NSPU69AvHAHTHXwpm6Mh)Aqp`q4;`NmJ~Y1+h+!y_WkD=GHM)xE#k%VGvk5$3iT&z?4dcjJx8hsJA~N2EB}g`KMA5c z;_h>TK{hxRj$5VW`26$Rv(!dLF>oL{*z?&?_09F_V& z)0zJ@P!PIa^^g!kP?D^sQmvSjYJTFZ<$?*)@GNA}!0Y{YiSh%VrcjcnXqcQIMpNBv zxGlB`zXT4%D`*Kd{nr7#v?Lx$>PdRD8nA3xZ_Qa_Nl)_zJ*kj@DtRLd38iGe ze*JPyGhrz}$3mCOf03a#Ng-JsEUhb6V#PI6kWC)9^i9yFN(7Wq9{w{|pVHaW`KssZ z7~B>mb}!QT+Bv75Lom@#x&H{v_}}B!0b2^sr5?ysacMreiST8PGo_clFeq5Ut0Yuw<~T@129JxWX6obSvv?{QR4;ZZgw6fF_FZ_xXG9F~2By9j7>woL^dP zo1u~g#klierQ-`@fkKPQ!g^~-+Vj&@>G}bOU=T2d>?cRysn(M-toR8^TAN+23SgIu z=ByIY6zp~LBz+PNW6`Asmme${0fD*If3F3+4$>J&DY2q2;S>QRm z!@M)gN~3*l3)26woaGt(_-^CQy!Vh~?!W^gE63};T@Jb>{8mnSo`sHa$O!64HpP{h zT15L-Yf0}?mzEDCq4CU~XTUDqEp^(%DT`(O2{etV^B$=iyM{kdS$YFk{-M&L8q8Qu zJ_@f^O#dvIZkoQ4b8*fLYY_?kdiqRRjMm!?ovzu8QbWB_XYB#CD!4bLI`%y!I4f&pDDjvBs2@0gwLwq zpH6Lg-N_VsfNfWwX}9kNPQIO zVE9)?UWI9Gms+^QFh=xDa^>gWhx&nul825+0e2{JB($aMUObzydI=!~e z3GJwcr%BswBo5>V&f*1ajFvNJm45x_mrlbYE9}`DcEgB6Sb^9`b-8OEoA}80tRC*! z+^@dA2T4k04i2hrv=3xPsj>}bGJDBp0GHx*VE(e&51@k!E@^>{51tflzUOx5&cyH) z%)LYo=F*WltqVY3Rmxzg0mBA%6)%}jWS2t@FlH&8&%s?^H0>F?Kbi8h*GNb#SzagI zy}u-&Vifu4BEM3|F|D%BCcYA8|5a!*EuhCEq6wj?SxD#xC>@%h-WRknR&RKES|2ua zb^i3Huj!(UqX^hpffyHlJY!UkM+zI+CFR5B9D};%<8mA6g=mHYg9s(o$O=1F#n+o3 z5e7dT#^%~WBB_?^=F!_a#tg@>5pT))rJAymLo4;Fj-2%nYYdKQs*FAt{Xd0i=OVV7x`J7+Bb&LJ zIbdpZ&#;P~L-F&yNCBG%zSj$w(H|4pPCl~RIP0%zKu>noP#^_#RT_ta1GGs2*`&Wp}pcRkI#1d{s8$!zwR^IjB(fVbgFVE2i+$78Ow6 zdNr3_&t?3s_T>0@v!w-B3=h@Rw54NbMtWm=YQbd=R(j}YI`rTS@k+b~lZ4ocnR&tJ ztMGxlx9|LDQT@hg5YFQ!ppeEdHl_&TO(RVh*+1Zj4zd=B5#Rx`A>Y$h@Sf- zUSG*)7Ahy~XK*=M!@^$texv!TooA0pO`)k)QAm-DNZ=5{;MOPf56_2Yzl;m%~@e%`RB zBpmSJN9Z(vnH2o%8HmPt7rjM5B^>NkfAv5%h}+tB`t%7F%={JqeM6Y7yz0R1#qc8< z8;rhv0Xu)W=W#h4TyXbLqx^b4*WAjJ?FxZaXlZO(y0u}3{2=9G$Ss+TeZ(H$4OcF> zAaLrMBVpJ?NsivH6@*LJjS7@+=B`IiTOKyhTwfJg@$d;_%YEgUC0Hi@V=0+AY#5c#*>{v${OB4IkVBR7t29hK#mIy}!l1VdxXJU=bfJHS?VHOL{aoKQ~d) zrxwLWqzT?#n!hrNr7Rj~Y(#?%%rl(oi%>HQp?Dm%9?}OH^1{7X)!wIX)ksQae45iO zOsp(@mAduV$oHo3^Ko2;UAciLtEhP-sitqT&>O(4g&juveq97b(M@jD#_Dg(gYo&h z@lI8LtYBMY-fII>LKyq%800Vj~dNBZIs3<~_c+68I9IVPt$2AN znAx9;u{Y#OGX1=*4Wv=VTQMHWZQszrct0{PDd=~pjKDqts-YtH5?sRju7TP~<`vVQ zE7O|gP$thR^@A~;Ai|jDbXnJ-umQ7gB`E9_h@D0#PdwGJoZG|WKEKbsH-;R(cITo$ zAYx|ZIzC9WwwT{ac{1pdPMtSubSA&ZO=>K=Q+2F|7cMdz#vLIygkVvBN#$EbGDWJb zbhe0Fb%2d%@u7`d>3nymlzN4HowwC_-nS$T2kX(}PV5In6pU5$Ei$uEz>F`aQms~v zq%4(uO9Xx8(IpX8hjb{$K6SO3JY$zr8sA`=_>4VtHPZYXe>WAiBS|_7f)g;IA^ks&T`3?G3*S zGR|tgF)VYO_B9Z^*g2$QAEAT{793H^mA{&0rGg0JAbWu@BZgn>uh0HqmVa9ASZ8Ku z3nGq3**;$g?^pc&-V^Y-xXFeGQ&ukEz3BqhT>1jX5M+>|SgNm_q)ssr^P1ccNW;6W zj|oYK6#o@GRR*Vreo(Tr70Hd*u~nx~w@U-+?ioK=J9Uod);H=S3b=!jhd`dxfB(}g bc`3g5oO!oX&awan{P$E^QL0eF=-vMTzn|wb diff --git a/docs/finn/img/finn-stack.png b/docs/finn/img/finn-stack.png index e34b1ecb454ee25ffb64df44d58e7aa50c294b7c..c2b49de57e8c7e56c0d850d8631cfca9d49128ae 100644 GIT binary patch literal 82992 zcmeFZcUV)|*ESs6=zz+g6zMZyL5fijkv@Y2P(}d@5K2&x8Uq5-n~sA@3lpVC7a|Ih zfDjTy2-xV&P!dTbsB}VzKnT*m9T0V%=Y8Ji_wV;z-?=W!NzUGTt$VF?ueH}XC(#yX zOm+xJ3P2!`9VbnV&O#uYV<3>9yte%WK2f%7H3k1{@H=Z_2q|il9t9_x-TpZJ2Lw`* zAh>vG3poGzswvzL0#SRx|Fc2yvMmDwS;;$T^vC%i#Q0JVO`{#o40ifSy?xte(_eqP z@r&V|le@P4_S0`e5|`q72Lm()4+Z+$?MW|X?7DkJmo^q@OfuJ9;QWG+ZDTz<(}zY* zZ{9f=%BfPAxYH2#ejyY;1Fs9oj%(SY7C5xTcs(p^xb3&UfBF8`uGZ`m-#=+M@kS|Z zd~Wf8Yka-fx9b}AIf)+AwR-umPF$r;d&r6!1phuHIp)F(MR?RNB{3?F??z(9F3H&8 z*?Vl9UE-0(->xb(!o7l#R0FOE$}|3ozHl955P8Wel0@o%D)ZQPJKjg^n?V1Za~4%8 zEq!Jwrq}eBDO}}Xtg_?ym%Fv_`l<^&>!uRL`UWL27sinT-;~hW$B+keyIv7A3XRG^ zy&N=C4(s^~*RPTkUKV3wMHxZ7bk|w_cKL6&UF|adL2Ym@{=H0f`{2e?hswf&KPm@= z4d&vURR^n4f@X|Qmhhu=vACvOO2DCSIvDP3Bq{!8b;Fb=V8nQ3aVU4dso9$RWlYAA zaF`Y3mwnmpJY5X!DQGMb`)!QxDN^mWk{F|k`xrr^3e)pqcSyB4rHYBMs*VevSQB67 zYf%ZBvyWBwCq-dVlzf#vgFX*!jMj($v#(Uus7oYzLiA44CyPS@NwwMDrc8@3#;-M1 zuY}@U9J|U(|4EEBskoXLYdkApJXjgai-CRDwDYXkZ*P=#4ayrI+SRN4OHyr)y+~7S zwr2I0A@RPcPj)-JZQt_tOivCk=X+P&FH@Bs%*9+dOji~gta$K)6erG#{o3@(2)_5M z@N1=AC_$q+hZ8P7rm)1)FVjN<3`E#MZ29|1D(p&vvRvTA?fJymlNCXUf+j8$ce}&% z>Pq)woAYhEzk5?+;+8w5re&%kNwD0-L%EB>gyIG6V}m6D{rK+Y`yS(@+T5v3ScS!9 z(eFSS-l_+R7*Y@zQ_daB9EF=bi<0O{aC!JRHS?h9V>qKD4EXpqxt zmeX7y$Xiaa`C^(m=Rs?z_XnLU=Z+kW_cwx0}PSG{AC0Is2Pn^5FyH zL@-chUB4cZO=9E@{NgsrsP7}B@0PxMsw#r&JDq6Y`-rozBjpGyG=M9tMAk8mO5l{n z$OmQCjk&a(SLxW}siN^nlEF54+Y~5jsVKkhAPr@SwYTgQ>7kmI%MCr6`}*$Yn{ufG zI`;x?VZv2soM+TrmBc)@Ya~l{E{4MfD`PzMBff`phfJ}>`Ra7Vm8MkY++71-Uh)9D zG4KF~w7&Au0Lj-=3hs29jX8VAY6mm6~)@P7p=Mk?V6X{K^=rQRi;p zzCQ2o20D5x#2Qm~ZJ{WaUEi)K@{PJK+mfPk7Xi}yni4Wm{8_Hg5fU=?^`$KSb3x=K z>!=mfnWqzxdwk^-926OZmbBGeORh;*tVtgU zy(eeN&Ap{&`IjVUz&4(g~}QHxS~gi0Bmq}cT!CNY3W^R#3zS70yeO2L`F@nY#q0$Kv551 zu=y|WxpQ+aIf{{sElMJmD&Nco0E_~FwE5B9fYvN8wZAQx!Vvabz^S#vV$b}7EzxY6t_i=AN;|aLVRz$R1zsrv=oau zU)B&MFRIRkQGqH3|BUDaMlks}c;F%?Tv~ufpAhXm`e@z|8DCPD_5*F~jI~yzFCVRr z5ucZ5p>j+en{b@>b6eJUV}5Q8M2+NmL)K!mn@G$zmhyGS8Dgrl^HoGf#oK*z6jwf_ z0((HSdaE0>=U`ulNAZWFBIn|MT+qDC;$a|pz>t!?^a(y?wnj2DvC-9U=j(UJ=hU68 z2}_QYcXVEgZ=~%Du@?VRXty2A*>)CGiBai_%0`RYgFzjzP+14Z0(-= zQOHvv0x%`d{AZKPd&OfSde#7GGQ`v!7LTb`2Oga!DDb06sfLvD{Yp+F4p1e}#ou$~ z>u=vp*eepFslM6O88`7Vo`5tF<2iu0mts?)_{jwb&o ziLJ3%OvJGqi}MHLMSsvpMN|NG;G{Dw=gv3JFpDyzob0gq(OJ*tSX|r1r~V&5-1Yx| z`2Pd))oU95-|7qp?|NT>KweCqzpNN*r1X3KFO&PCV|F>FhMk_8Y1G{3eTsQE|NbPo zQE0zG+M>~vUt{qPX-brF6l8eLsiSpDO&%>8JSD2fp=w>q`~3Ums$xl76d;h#?+(co zo`XO>go(tQ$t?+yP#1tfAP;E!oMfII=o^4Qt~=*9ScpZRzi#oHp`6Y+OGCNfdHR6V zUCsU8XP9YV>4|~ccZK#Eq`nt`WzTm#M!bB?pCvvs1|?}0KdQ4Uz9hz1(l4GI;|~%= znw01?r~ykaZiYaL+MlGC*~(LIB;ndNL9Y8*C~8OZ%Nj+I4Up^G3Wt2#b4qqTxa_~b zuLX$VSMY-v$~6u1I~p|)^d*OxTI(T$QGE|f`Ro{M8 zQ)vhl3v+q74Fb_UR!Eh%a(Hx9k9ww7+osTY?$M)UufGAt$CZu_Y_yy^BQF}=6u5h| z(sv4)MOL*u($7zu0N>0-rzuwU;=lpNuR8W7>|51wH($rWSrJeJF3+g9UVtBNf&px= zp0<)1$U=qJK6!oT0wGll=~k+jKlP$qhVau+R@f#0L7gQgM`hPw;I7e3<-y8muCSu0 zDtBM=yhoU*dbL5Y<5Yj+?cJh!5&a3vBMJWn+1b5oD)Lbk4BxC8SujJgIWHCm6!C{^ zU^+;ovs4F?hC~k)_2_ct?t?HonLr^ZvN8|?Xn@=j5Js@`~V*lLWK)L>8w0YHE=KQizQitzUgV5~J&=`Ph^^wL} zQ05=~uC#-d-d+avY?F4Z_0iI?Xl_F0Am>0wAZtV?AmZ3u(g0R&8)! zZj1+BzF?aUj>GLdgKbH**wTw&dv#kRmb9b5hxK8@+IMIsETiwFeZ-=!czjc!B&zSy zfN1QigHjj+Ws&d!1-Z4$Ui@A$ad)I}gOQv;x&B|()`!!qRFtF|;+Z?YxJ3-Ptxg3w zY*AFXcYeZ@B4jtRAkh2@6Ju=kvO!ra$zRsvOht+0`F;`5V&EV98~js@3_O4HHs$t< zF>)z9Ys<&tu}$&5&fl(R^iLVELxlO|IzH@%<|f0i_J@`F97GO^T>u-XC3w!rYq{ma zYqR~-1g>S=^EP8l!EyKx zAgZ@nwL&m|fcL5MhdXrQxFuzwz`ku@`=X@8(*N{m(V1Yu?KD{GTw7<4BF*7$;NHHZ zFt^-X%R_qx1NS6x4k^XL=J(hx|HYyQin1RqzD8k8cbroMW&y|jU%?2@>FNW)U<8G9 zF|p5~%$je!Y37Bp^eLf(Ao}@-^j`jT=FEFlI>FXw#p1Y_GE|AF+=)(c`(eiu&2oQK z&KBjCF8gT?%AGv4D_ZjW>+(&@1=;flS4Ts(C$bLli``QZhb<|J;&IhFJz}xkBS*vC zESEKok_+<}osgVA$=!4T!l^lsjQ)$pocc!KL+)yZIyg8x!_bT;#r98^z43+D8$$PvIza033j) z+AE&LD2m5)eaIG#iI_SNxod6FO@mC{J~Em)y_N@9b7Vxu~-+(P&HZMG5Cmdi!*tbVr>f-ukD zcUZ0?&$m^!IS8jzBj5>ykxkJtu=&4O?8+Ff18jav+2SN2v?jtzEXKEF#OsM5%wTf2 z!J09y-{7y7Hz0mD+w8o`d?Oei?ndD`z`V^!at57%`5X@(u+S)Zjr?2dM*Q<1 zV1?rSH5vIf4}pNr?(=PcEXu&PD{%kXf{QFSK%A(Wx1i)+RqVwWf2o;O34ytl!Tt_P zqM8rY9B2e9nGX&-8h|P`wC`M(y<$^jSh)o^M<^U{8UI)XG974Db9d1RYz~*M<$|$b ze*T#>$Dh+MZ?VKaymY@_$yn&c6H0oV@qyK&68=jP_Kas&5*zorE7 zexT-eKb^C-#s^QncMcNbcmU!DT{g84zhM}XWvz+m^+f!GZAW$_G9 zV-yU^qXL+c9NjY`=f>G5;fnL~pkC+h2AfDzZ-KExA_ae}tF-FE;`e|t*}yK)vgMDL z;|(cW4Zi&GgCv~&01#`%CId{dA;oUj`q%`1aM~+&tS%K^Y{AWp7Jzx5;%8+3VebGa zLP-QLKPgInX(}~BJ`!A0H}KAolCbnHVeO)e59|X^8jHqME9HD;ma7_kP4fO;SB$_X zC)=o8p|B5|)@0(-$TuO{RsPG#T_gw>u1TcG+Qob;8?Kup6`c6u!TO6032R@kuD>p^ zE&&Mhualw+iXkfteL|}juln`6n4epKY5(Vw3agxva84-5T>OJiu-_C?2VV5A6TWt{ z)wATaYyX>es|tO&9l2-i`~RlUf74EA>8;QY=>AKgf1LpB=3mM!{$T2#*9pTdnRV!2 zCyhh_n8D{w25Un6FHrEcU#MAG*Z?sf8DE@TS?)8akDNDO86C=AnU3_5g z$27=ed@L@87akB#z&US4i$!iaI> zsK>spx+c?Dj?Lc+K&H+y#0|yUd{1S4weZF8gkK6BU+`DzI{RDMe*WS%)<6oHk_dUB zm-7=uH^=0T>|K2I1}vDMzd-eu!K*;wr$t{~k2CcxcCqt#48hv7U72*PI=AC zug~b63bLpZmM@|AtSV7|cg;>O=H(@3CS-^T^SKDU5ndgA>s1Z@+CeH`p(6J!kRjI( zh%_23JznX$g8X}h*SE}QM$o6_mS0C6W^67p8n#AWW7?oVJw5$3Xr5M^x{Af@wN?3a zNh0AoU(bYn>w>`4DlPV6K0l2!`JIm@Ul6{Ja6)L65Qw?r-N-5P<>(%&>HHozlKb=u zaf^L)TQT)1Z^IfMt5%5}04-Oc<{&FvI0PbVwgyvp`)rsHYFTOGaPG3PfoqX7a$5#`Z__Ay6Xe4YQ0MRoH!yNGa1$iUVsFTo61wDX^Z(F8|5T-H za_1gz&X-}?oKoO*-b-_SAEWzVWg&8f(@K)>vO;>&%va5j+EWjY&Y~%cMEm9fKhZ_6 zhl+S3ZmS$os_~BeYl%s;j43bAvAf{nm>EqS zkyyxwitiRQ*5&5M<7QvBdDe7Gl%TQK1_NPWROhMsp0s2`=hfvWhpyIcjP-p(PtU|e zSpWz4s*?_Lawp`)?&l%P6oYva*BV9 zCUvagCmwKd|6$y=Z2 zW1VGAh=u2S9V|1F3&N8oqQ_r*K2D|@A#LTD3V=p-kD4^Ux0Cnle%@|cckS_9YzXNMUaL zn|Ck`>aCYyd9fh1kV>R7geG~Z?STcOe8&Z1t~zeSaNtIe94<^duCOA*XM|Qh8XR2^ zq3o84?RHh3d90yO_n~j8GIF-V*1>jxk=798nNGD~2k{AdH+(CW`XLz&W1X1VV;4zt zqH23P)Su$a-CM{lY3%P}?1y}q1kS`y;7_Xy-MTjUn&Wu#@4KGx##+)CnP~xC`W*Moko&)Z`VCY|%w=9V2+p@yBcUd%6;GHmQM5aa zsh?si2n%^YqE|fDPz^_(W|gGyCz!5}T&#!MQQZb?@1Jz;drlRJkM3qtM@E8(9Qx8G z$i;8&mynosd(hNaKf@$ulirpDRbuyDYIr%R*oJ)tR)-(XF#tZThtbE|Dt1({-pSYI zs1g0d;BKmx3mt@85J>xhb-iF}pME=uGL85wIeEAsUlkdd#h(?aQj}=gXLn9LWuA|C zSIiqvH}lJ?arUv6gB7~kL|O*69s7Qm-oFW=`xr0=zm#Jzm_I>+3^5})4W7u&#NLGm z9z<5Okl)3;S?+J4Nz5o9~C zVO(=M`Bb=KQ-~~2UZ@olCw*I8&PyRW&gWr*l@9#jgAN@x5c=cMT-F^!%n&F4)!SqE z=>V$FkUP#8!A}5Rt1OVcwZ=B&KMrytmLIr_>Ay-e#5m2o?v#v=?M9cOdk}U+KK6vO zIi}750Tftt22xpi_KwC@$g}D-)#lNhFMKwN8IW7N8F`VUae{k^0}^~Bk`%~iQA=%R zVeQ!M10CjtGWeI&IGge{!A=iHcUyINvW{p6DvQJ>Dio(euJZ|Q%_mLKVI$G{Pf7a6 zO`M3kXH=r(J>u(J8Fl%8oh#&*CmC?Xj%w?JMTw&oLpWbAOO=h+he1yJij3C$jRT-g zpLU+X8Y6#umDWtPseN>3b=(ETnDN>CxMxGStLW8H{0%s$9=yX@cKl4ZOK;o;)w(PE z_RVgp9+1??p~T`cnC4;eAP@6bUA~4CpJYr|uAyyEL>n!IVl7Wr%XF*ue!zcU%j=8b zn7AP)FX1mogred%5m6|nO!37w1`iwp^LD!RQ7Pdlkztwo+jSX8llFDK+A+jy1nA^jpLB256 zgY}gfgZiPXZdWQ6LUraYo-Jb|EE{%>W_Pz;(LR%EdAyXj6%u6sJ;Nv8ekGg>w0o?9 z6@+!9Nz`9`NZlAk@=#TaJM?JHflWu@f*9^Nw8NkeS|)tQ7DxuDC;6dYy~L=I z0Tr}7Zx-!(an4W^@f<^W{Zyg~@Mt7qh zYfxt1X=k0{&=WE`t#sVZ8OVb>Y(C}m%@Ey3-}5sjD!1V#u4Dc%BaM06#cUwQ@69H+ zh!3K~117CLX14Wib<#k}evi{-#dzl@Hte!H{Mm?Q$-nf9- zGGn13D;;uG;2-bJuAPxzkm{Xv&z{ZQ>VtG71815n+pm29!_8QsRFLRO9@PpGKB}?f zp*#%{$kXKSFgDzlCMLn;c%*2NpKaBj+uWUr`-4?A!cp+(O>pa~N|6YUg9EX$sM2u7 zmp$-akJzMT)S1_BT-Z-2c z_qo74e9Q$}GV<=|PER+6m-!9T9+K86j!#mme;n9Rp%0|_E+M6xw94!90^bvt$N_of zzbBqke>|Vg99VwO=(`i)5n70Xyy*XCl?1WE1Pxntg2i5}>{AohfNYam&(K6bAiC;9 zeM%8=ibc|x(5Q60XnQ&e*AKQ(dCx&ejbH6lBu61sw^r1CWcS*JF!g*|7kjrYQ)(l=Zl6pl3MrRFjaPzN=8C z{Z7fdA*NPoh!4y|tHzq$E==3Lh7G?RVAH2?`OPL9Bsydy-SV(|D@N!Uxr8o#5F|TE zEI~72zFc89oJRHOBiNRVH28tcpY@f&soqOgYxU$z%`y$tlSE^o2rC`+`~+U3>I~nQ3}X@kzs1kd}u_W!sN?H(Bi62r>P(S9Wq0Ic)1!x9Ndw z2(eG$to~a$)sA{>j4gk(q{GNlo@Lqvo07JahH4V3>-bdFK|iY;(eH{Rd(VmS6*k82w~#dlY`bj_dD2}M59(3=iB z|8k@S2?~ktrWhZ|@iY@k?s>8WBJ;Sr4P&hTWkRndJ4eGxN*z>lyD2&}ydQ2R4<+9? zT+;1TKwJ_Yb}+A*$RpK2t`C1r)sU-ki=PX(QpXWfC-5ZULOonzhMV$SpK+T{MKvyQ zp^W5sK~_#?7O~>%YvAW-5~m8Pv#(X4X5_TyTJz~Y=oj@%^{sXP{(~k-di7}fT1REI z_s`!H^N%d}S6h5L0qukEe$5DQpNN0_@BJjrs*j-m!9V_6Kj~ksv(+ToxPDTuUYqMb zz%?auF6Dp16*TP>mI@ti>f$%)!Il*|cNe-LnFn1NtO*N-hs5O@SQ$|2?G7ldt@M19K-#3j*W{U}Y3{P|TYg=nEjAC5}bUmbKSMWHA(4au2D!WcH}msaX^8G(Ch1(7@WxUB}m zVy)wL-F?vKFG(c8ep?Xe2pOd6o+W(EOnFPd=jUl9F1klz#Ho^edx98#AsV!rZ&Fj# z5%pXMTuRgcckHva0POhP|IB)t+Wcb84|OQSLoIVdJA=M5)IViFOTF5slRNprJpC;`A zlX8?w!Iilde=7uE!f%*a%TV?=#6e^8dQz~~g8!dF_cZvE{dft#&WNoZiM{@K0H*nl z3Yx{Z7tI32&Ki^xkBO?wx53k5#Q(aceU8OLDHECDWAlA3T)_(Zy(&j!pUYI=Kz|86 z474_r`lsVG5!N!<^!|uCEj_bfRO#Je<>CW{@r9GQ%eD-eQD#`;N8L$}))4q5>FK$o zc1@5VxPgW3;SSSKci$5@aN%vU>piE`Wq z>=?3vKuW&^jhyygZlx_RJ1)jXPEHj^aLcWvh-IkebUrukFoa1pORz7HW!i%JTD=R|4@ z#}N_SnqUtHH@zwwM%;NqTY_}LU9@Tls>M=DJs}cGjj$_HRzN#2%EvonhJHucu*;`> zi)-kS?Y4XT7hjxOH%E$s_tV+Z@FVDINICP(2*=HBU{9!~4JMpjD`~#ut*2?qC~NDH zg@O+TJ^KldXNSmf{=3JCHA@SkK^~duy)LrF7=Hqa(R$b1T{oNsuXjKvlm|5T z!=qBDGXN@vmKB4rOG~J2z=bzwi5-%Dt~tT1$dNwchLA%azjdL`uq+!+tCJ=glp2H{ zobBv{wpv!0o40209=dHO$4L^%`Mp#3uNQ)Hbd^uO`~`(%-A!qa;1Pxb)W>1aI#9Vv z#jy2N=`{GLjJJ1~Je#wxJvN`6Dp4XvPFyM=rWD4vQ-y~OWfi>HUJ0lz2wPewb5G)Q z=s=bIjTDJtZyS@#Z%kK(*E}nJ1b?$ zan+bA`j^CO<0i0$DQb~_oZEEA^DJK7ukCEgbg5pQXs|>u!l?TV{ZL^vu~(5fR}fiF zl4{l2>B4)iHx^Redgln!Otw6$D+_6YsBbDO+5z?C?~`PAYPWR>g>#jKY7duAl(-gWC8|=VaB35Q2R|bp6 za{DQuTWL4Xph`)ioQZ>zbqCe5LkmVzmWL3LM{hBY*p<+qe9Q|tW5y4|%Nq5WguumJ z?AOPv7+;|)2sqPKM_wB1vmLVPNoQOLvdqS_(P!0%H0noOsP=5U*p?~Uym%y z(wPtSJ|j=7&st{Z%^b(sEWG!WF_B534O5-V=})=)i#M>oDEJj$VQeUp|8CZw4yR2; z9cNU330cY~`;;KYksb{ClA@KAEa#DeS~g|G2FIK5sJP^b(wW|s+UIU1Emz=xov)-$ zdX557L*1r*yS|jQ)e7?iGTMIDm|{bFIFx~h%Yl?uMeOh+dt6Ze2|w%*pX{ez%Cv27 zbeF?YN<6lg^7KMKjdMEUF+?)pvlctT)R!(&rcB4y(Y^&CaPJ-;jq!UWrtW`@bWV>gb-s8YNRT=~tYK|<6 zAnT9z%>*+2a|5K2yo$#G+wrJ!+fKi6Vn5b~v1#=NJ;?nAv%|~MpA)M_VC{6TTgh>{ zMeSij>QFp00r3Kl!d_cMT(hT=LaVBc>hSv%hVpo>2h|QBT+86DGm{1I6cb)fTz6Yf zRax5~t;GC4FQd!RY164y8UBkJ&V^;t*(nXIx-5y}gL8?F}xYGbfUW0bM%xz^aJ5h+fK*~4sguQ8;$FTz!C zdr&IHwQa2boZlho@QI`n`tkG>s#bm~Eu+DfT;JRfoc*$yQeRv_(MLKHeau@WOGvR( z^GAGTsP|{+R}<(@8b^HY6Z`0K`fi>J&M2u1emiX$NuKJyRcJ*23(IdbPtg<%^;DnQ zGSwn}nX`Yt8V?aEkD|H`9PfUA{z)+(6{C5J^zQn4yM!2`Lqb}_fDI!XSS*8PpO8qQ z4XO`C?1f_Mk>;&7$sT&7G(Xn?qASz{jSS!-16sq35wV^v?*dWA=p1Q1@3VeILH*Y0 zn0ozB&u)mWRer*saaAM6wZJfGar&~HA=ru+O7pLBj;W(t8T_6E>y6oEud1+dXF2t9 z#<_dSaS18#VM7!S)jpKxuNOS*edcoJL_tq6u~(_8<@n0ct+^%V7C?W2RNtKM*KH?` z<#pz|;?cSWRV_Ald7wD!J$H{1-Vqj*N=&Yr?qDHLyW%!L-7-R7{4Np$3N^Kb{rTtb zQJ`pqvB5t{o?clvHZMq%f|hlRIkHcKr)u7=ie;Bq@N~?@hC%vqh&HQYSV=qSzYIW^1NV)v^alk>oj{w%TuAtvp*ogv)}orfkrM`;A{z z=ETM2q`)(hjr~3)Qw!NObvJe0|jo;i{_E^TY3mr@~_!0b@A5& zPe`PiL*056TftJdy1`QF&aFm4B_u5nbu$C4QBJ@ohk~IPzkWsQhPdelGcKLbr1A1B zK-Q+vEi2ZJNkkN|2`M9K6_1DB;n`LnUARFm)^eIJ{>a((Y$Mc!`Adj|&vg058SK#h z^ChIdexejukuX$SX=$4WMOewnE%hQ?d$)N9uPs`U?XygsqQ1oW>%MAA)pFA)R-guC zCD9B|XPZd(r4HxTnz8-q%_9{KC~3RkRR2Ty4$$w)VW6K7ZB<(F9Y+yL_X zhg$lOVhBa7w<}0fU%^?XS(dY3TP^zs-Jt|@WV2l*5OTUhJ3zgy&z9-W&iDW2qs!2< z%+|`CiqwGFAl3`|kSS9u7uc71eB?Ne?JD7q`s{@21ay04(1q&n!wue{a?4QoD%hJ< zy##Koeh-n4^g88Mr(L9n_1bUte_hct6l#LMxecmb&#wcAyr9Z)&aKEP05S}iM6)>` zc~cyjE{j0R)?4u*?k*Dl0b;2W0Alr#EzgHsxp!>p%iX$KFE=*BHBS|CYk~9jyHF>+ zgON1)5GbbVz3S%d%ut-cuNvZC*lErOA}jHh?3Mu;teO)(L~B0m^E{~SyiuL6n+1@b zfMZ#|vjOpJI`r3%3fi`w9=YRgz`U92B-5W|sd;Z7KW4s~GzDH?Gl+LUrUNdLsgxM@ zR^^0T>(ajSS0Y7;SNu)O z;iFF#S9h~oQ$I0ciLIv0)4txT2mjSV^4VNLw1?E4pLxO#Qp5cU3H%4`2l1b z#s=~A5S0cROVoy%ddAZ`cl?7G=k%I_Mfdot!>4?y@f}Lxlcn=tg0#krnL1nXs5X#* zCfqV=h*gu>M6}7Onji5p9NmfDWksfsk^=P!aM#y521wd7zA2eL#id0PlfG0|<#)=H z4!+5*rMqtC7`sBBHwu&U%bAD4nxIlvnEp{!>WZc26w}9skpXh=`}X6J$l!hhq)fsG z;IjBvm9oHPdk|KV25Z`K!j6wUzaOtH3%o?nN@~r6LfhRg`h2DK9l1avga)+E<=Q#* zvATC+5M?x^|@6cjr=6gFv zMy-xXqA=1wiRCmj$b9bwcygTto{wZU#}H+%3a=-wRlGGcGn$gTHoLx04Aw%1S&hb~ zfGP%QQbLmQh;qL3xkG!_P5*oWo7`esuMm`~jnDn*PA2Z*Zw@K?uszRX_&QU*!|!-dx7hWTHb?eZcyw0sR^n6FTN?3aj}vs z6_wpDUsZ-?@VSo8Y#!Hm`?|Ts$1cA77MG(J8F`Eolmyg^VmVo@rb)ibX!N_IG|#q6 z&Z;4I;0}AeFj_Ys34EKMCVr`w9*~I9j7+7uUe+gP_+!gXIab+U3ac9LHBETnJmZ84 z)K_8Z#vHU_>N`$1E4Npp45j# z2ccpi7aW^kcdmu~+_$hF$h59_`m~&BB|}hftV9->6Y2{<;k1Q{K{)}KbaOh1w5j|v5A<~#zmfSxC!v#MfR4LN~C_tJMh+qO!p9^{`# zKi1OsKN*4rQR98hz@jZdZ`Xs-H*d0_B2~`+bbgPuL}5H&^2`N3lY8s)b2A^@3&FIl zoId<9>#Ns3)=53ipbVR^Bk-tIRdZaK#@UZGijPnO_Cc}3dWETxKjy3}d^YlQLea28 zb$M~!b>A#`3Rp6=Gt|LdPe`ONlfP)F0PF~##hY^B!Byx(A}851o?Q`ch}*yc5qrHL68E{4@zngYe2TdL>3{*Ym zIr|Y%T!}00zXk_4pZpt(?BP{0$#^ug{|)0uFY*X0O?Gb$4)5w42?}u{^0y6NBeQ*d zbv2Hp57f%8Fh5u?v&0I?OF!p#x~$Az0UOQ*<;7W)+)N-IjHi(t8KjgE-Jq_27X^HP z$)24dow+rrijpdE;w55aLV0^bScn$Oc-5skzFbByLMaghOvzfKg&(8M>BXyj3rZ1x zJUkrdiLiFVcTnpp1XfT_LERLs7dEM{I@_#=Yw2;v9SbU}zY&Gb;%^;lucRLu& zPWZDm*jc9kTOk;znzo)#X;Wzr%c_T@@sAADX-@*P#$gOs3b)7wD5`GMsdEe8KkP!a zEeu{Fesh5k?^Cw?xce>SHRYae5ms)yo38s009*-P*dxrw0(zh1cOF)ADaZn4yg^gG z$Bo&xeN>L??1_?}Y)gVm95Dci$<-a6g54!y03}duTfh8RF&yVt44^Mr!;X(inex^X ziQxrDgAy#Mu0D6E<+wpp|MYxrVtttau(f%PVjZBt%D>V+#pKoyNREvHQSzxir{6c{ z>w{#LI-2hjhuHLAG4&nT!6$o+Yv0rSaMq(&4DaUFg}AM!*9$Le$!tBMP`XA>kw-D~ z)ZA<#Ke4TqwC-Pt7pypKRzh+(-ReJ*Pqfu3(vu+b3!j!DhtLqvUg0mPfo%Rp5nn4W zcLljyen)SGYVKB$+xc${K`Yv1LZ%ZQK05f747+%sdIJoBdme!v7*ujT@3*8Cg6G-q z=D?^dA*0+7q!P6tazr=dK%rMLHyUp%v`f9q~^w$}y-dk9d4cp5UM z;LapA#138R+7pthVg1cxA-X^HUK^=%;rARK)kp>3+^8$ad1N!3LX^3#&3s)>+~IUh z?_L<^Ds;X)jPoM=HLK|ew+?e;^PwpIv(KML_eVlOA*}=P=jCoiA*H~3SYR6hfq=Im zuLmEmfj|UTUkC^bl~c9|`e)0hssy$T1H3D>dgHU`*P8xcBWX7WI6Y67BaQ-xhK!9z z$*VvP*Sc9(jkBmdhRuUOMEDQ;Ulm0d*DO4EbUe?7 zvGFrIDscA($Qk}!-RppK0^9Qgf9(>B4$>5UduEu%G-(rQ1JA{*z5{VplvpdJcbz8^ zSGDz>G;n0bBhtE@Y zF;8!X92e%lO;BW1o9fMvc_)^M~!Mfq)qJOv33=-=BCC_efz z2}<2E1`~i)-2;zQfD8bh6Bf|BV%*pIG+w|+w`wEXR-in?4RU?Us_4St2Tn#VaS>ea z)aB&cRu3HYS8#sNY$X3algjFY?);m)Uh8j5d;q(?wY$FmU;Ie#3BNm9?8)aSgS&w^ zWod2qJ(a+|8223$WZ082Qfd5l=XA9L>!==yB( z%$)r7LHa$Wl!)xbEO9@fFVEWae}d|c8$1%8okiH_{JtkD*QEO{0h-G^O}`_)OSc>= zx4A2;qcR{t-wSuMVk^%lGh7-G^e$2*tzDYcXBTvd4Y?!|BZ~>_B((Nu{=uB&F*k2C z(c5-qwi67}_QKt+*m}j@O)f!PmM3>}$a46c1dC(4K8(VXx4)+jeaJj*%Q^k);e|h5 zJfE`+y8efVpHSGLB0@s5pwI#>kfTdjCPj%;cm86WGa?M@2*q~4t~{wT{qFM~SN&}^ zLWi`-$iN{VXH<*cXM?Mm;a7lwHHTVz0mXe=sg>E9C|*cb2-5T*Yu~_CQc6h}i=RuA-;+!>WYrgA9^e|NdXBHA2}T zZ18I4=aFY7kfP>T;*RV4EEGdTLN2~Hn=414m%m78-gx$;2T4O_8wy`Hsn^?h)d9c+ zQMqAAQTWW-9Wwit~S)ZZoqonxtu|A!nd71N=4FU&5dD|lBu`}xg#5%h{*C^q|YL_ z9BKfuil+g}{eAKA^aA)+a=p&IdCSfe%#yo=4syZdpL+G#?o(Pj-7;g{g1SlyMmsS} zlxkis2$9OtDY~DxESIA&{F2c}nckT_1e(aCm<;_I8)IyMN%2i`e~;Adtm>ftQ1kX><9NGk z6DDt~yCclI>P9rm{!)%>MU;6XrDV*@9I^A*0M|D<)>TW!ZA9{Td=h$}wh%^ZBr~wE z6(sKPSl-~BxZ@^9>^NQPqpk`=zE)K}(`ugY%Z*!J=a%+i|vZZk&YREBV z=Cfdl5`8mI?>cRk#6l>qNO_j0Ii;UCk@W7@7<|gRg zk`NEf3;S(q{+z6N0vu}P(zVWbS>ZUQ(>5C38}ZK@6)C+(-P`*tX(sZ zp*?LgaWlm7EZ3G$&iu?-#`3ry@c`vWM}Wz!vM-MCNhUUB>)R@4AH4R(vZWtHuCM7eYwk%QHTUJhw5 zxdKh|UTPgj8ecAQ!^1c>e1Q)R2M!vGm8)&Z6y11TB8H2gBT{Am5!)L?mP)is6#kT9hau0lnR-Qx2N}qhS09; zAo^T0mYx-E+am>=_L4D=CFd_;izM)VZ2X)%2q>Q-_CCj(uEiy6pLS;US%jlqPyuT8 zj=piH_AzB{-r#+=TAiPu6pd7m1@+uHjvkMUw5!&a?x{&#N=?nCZ;!$T;?{*WcSS16A;nAcVcRSWIPd3c2+wV#fnfM8Rq}Y+kqht{K zvTW+%&c#s4+gLBFrXw5&Z({U&>6s$9xp^NEC3cci(oXFxK6{i-)h@CPEg(M{*~2Zj ztxR2BAu@)`xOqf2J*fTzTT@Lglw$LwRpD;2-h5<&>1Es-vx(co`9AS)w3(qHLuSrT zyLYp`s1QE2l2hnJX9+DK3RilCNjzQO)mdX&X3jc}G)f?5+*G-n75ofjd+YOl>Mp22 z^V6Z~=D&Ea@H@(qs^XiM*v#sqKWR7*)_PVNvuaye2}bi>1TmhP-xV@R1Gri0{FMR^ z8vPJ~cR}*Ve$Rg9GS?f=I>orcC@7(&q&e^DNl2@at1wt%lXfKVN>WVIQ|G&OvCZn8 ziAe>m$wwyXI%Z~aiS&}mAwMZ(Z}-?qc2yboG3oS7&nLE--$hgkd_k$Z%o|ni24zY% zBb&4Xmb_J+i`~=7E${kBG6^a4V>5wc?N=msQ&t@AB_E;mkp`IW;kIZk6NU{6HM&Oy zU4)gA^t9KY_H1)P1 z*5VqBd9vsEedM8Al--rL0&|UL9YQBN`nv7UpHdZ1LE(711gPs+Bs=IJz90OsgmdQ} z_nMx7KY8=)c*n~vZ;;mD)~9tdG7@YSyVO>wyexOeG6$3+H;|l4G(axsF@s>VE6qnt zc$AI^_Hns9mw4m)M}xYtX!E_=MbsrC_pw_MIhayo$LLP;DT3uL0rR`4!c&Pg4Pil} za0B)3xRb1)F0^0!{j!Y8m~LZLkYQA-jOq*aVZwuGnQCdug*(if^_$!yRm0QBSzXmB zBYRG=OH7JT;wCvGo}Oot8w`uIPRfhFl4M4ybY!RboDDgoLR@%P=ZQ3LjtCjfAB5G? znVpiY3A6)CmCIhrHRsK2`sUBy%A2hZJ0ccGV#5oX;`mGx#E zEzJ|HHf3l(w!eEDo9BmvM#&PD^_Bm?c%fZCJF0tuNAkuCMq@kte4yRDgKAJ!&<98@ zughIXD+w?|EHL71HWAt~6&B8hwk3}CIJRnaYP+gaudplhoMdDwiiJICE&2U;nd}CQ zhx;)8mkT6nM&1Pv8(xk(rQSwN=C=I7)>i7&8CD;nH|I}#C8iy@6Gtc)OQJY~vLVZ_ z1+-8E!5!5Lp=V~YO!CsSEO4mVF5F`^1uAvK59_LrW6OJjmfpk-kT@lw>d+x@XCBHi zq|zU!#WpnvhOuK$p}Xt!h^T~T&r)33<;N6;O{z4+dizF*m4ie_*8$1TyCjOS3mtcb z!NVDcZR@YprOlu%M;!=xX{?5ZC0MugW3?&id+-HRG%@@I( zaT%f5B3`|wBsHjtG9N&sMRddljE~oLb5Dzt>rz)8`UZx`Vp=|GISoW>bx7?RJ=1Dn z1uSNYFBP|qv#+yBM(y?0dqd&R#2NQ60 zuf`?g8S`E}4ZZU+YUOJ7P`X%?chIA#bX?Dv95I_%8d-t7@FPN>yr?yYS4Q)PLG*;P=4&EgYpZSI5SE*KJLd< zY|0)K<1PCZsNkzPNOJyc(sCy-ghiw{=EMBrMcY`fE}1E4eLL$OB299)gMiClgaQ_u5NM#ap z$)e-6FjJI@*p%R95|+`@nkEaf8x=SxI(x7uPC){RGt-b)rRKUs2*~*)Yb&KyZyQR3 z?Kq)j2Ai9g8bZ?`LI;bu+D~b{mC;J6korD2bJV_Xe)gh{nyaj#EAu#dsHhT6OLd4? zdj%0Kx~<8_oB7OLKgayMZ`P5et|)3UomOkJ7wK5Cyz6u5(PIWuuK+U=qvg+Wq|85J z;roY3bqulAoF(GW+;N;hZcSH2?xH{iFDIG`zwYWINmbvBs`!A!#EDi7$yL&5Pzg9P*QRB(d>8&}96xg$7r}Z=p6ZY+Nz!mqpJxPhyfR^H zCrD@+xwkMpacKWZ6*shdubVY7ax%`Ekd}WX4Kp3QybaEb@X|38QU!m%Z)Y z2yldqrRm&*y;pHcx9gl2{{*{#wmq$Pg;CU2m=%FItt}dgI1W0;ur20~#vL)ty58Cz z=L=AVA`$q73ip8I;u+XYu4l2&-mJ@EyhmwZe4!K~SlV-4+F9*`pTuQL+udAo8z^`6 zFe+Ih5wLcYdBN()P_`sw0yRn!$Nx5zp+j1lJ08}?$_Wn?Z7^j{@60HU zs|AqGNu4X+fmGM9LMqd{$XfpKnV&}zRq{i#%PM1@g}w$fl&9=);6zQ61G;0XVLV9c zYVVyi*xB>nO#^U*GkQPtayT6+>{m7KE9N<|wISE4Eenp#HZ&qi)xoLC)~Ady83d*S zJ#yc&HJsIiMiVhxVEv=-c(#Nhh3DlWWoR-t^ z7RBl4UObZo6a8&_zR1{El~D>E2Nl{B=lc$q;$uVps?lp@`f&KRd8EI^)ACKb5}PQBN+f7TnL6yR%_G_o1kL@lkK{7 zhh4oos;N?RMUtYLd`D!jUo})1eCP~=B08F9OWF|Q*qlFC<2=pnVWFE&;4-VXskE{L zntK|V+^aR~KkUqGh0rsJ+1|?BJT2uVZocAuLVK4ue&?+{BGI;*lb7kw-0O18w0+&k zOEaqHbK;pp()Y-1NzA*e1HC>;mKgR`#*R{$YeQOI9F{u!1vWNyZVI#tJD$JCIkfO) zv|K^%%sZNH>WpET=@5aqJpy4FSUMvX`!Jba5{L*Jb^|M|u2%kj=V8}D|9IV}!Z#_7 z+gOg(VufuI0_-hF$xGGJ_pimq(|Ucd4P;W%0*;}b=^941)R9P*ADjDGNrcp{bId(s zSVQbkeQ)KX;IF$N-ti8_*z}pHmFP20Iu(M^3O1t+Ka48iWI{Rf?OjDf#m8_vGwEW0 zU?|ElZ1pj54c8o>Q?m4N4HmY0W@-d-@ZzDW-1#P~zQ6bEasJm==l6SuG9Q6rAUF}&-p`xv%Q>*JSbhp za3yqe)IM5+|4xa&3!4s=9C~q5PQK2#r4wHLCa#8q39T|YN2XuG7N~3di-^K%TV(?8 z=!1Bx6M3xAlgAmQ-N>HU&n9iX1u0?JniNVmSD@Rfli2vd;{k1j96GZZwm~b(1cV!*4;O%cD%agFb z=FiObS{89K?<6x2y{?N&2@e_S?8~zpas|DHUje;N{JF3p>p3{1{nKh#fUXY1ujiAP z;t&m_P-4fKt-LAfR&IKiYo9z3FXcDjAQee`IpN$E~U;@YW5mUm%2hW(}pzrvjrqP zzbq*sbHcPH4Hg9BJTe}W#LVjzlLYINkJVl!Sz;HE0`)gxn|uT?`cZ##LERyt9M~7{ zc-12B>fzMUmrr2s{%f2|NN+I}Plg6f*pEX~LRE2C4G%K=b-t&*2Ii79W+%M;2voiF z#bUovQ!89(ak9#3S0r17?OM3P)!;&P7HRT6TK~AMtXHU5S?|NH7E86+4fVaA{qF~O zr`O*F{NEj5_kp@l_KC6lC0fz^3nCq-v^3n7cTvV|$45~!f>)up>Y?KFl17;8Rd+qo zttO{i13X4UjzpiKKPyhkO#5>Bj4kYTW@4`Y?vg98#-8L>wiTzo3(0P=Bw}2bjYtDw z?$W_KecPRuf*^~6l=w6hjWocDpzW5>3vD&(n6HZON_YFBYy9NHxZt zDO7nXw0)X9?kH=DU zbl1h@aV;9$>>(=>Nfhlld^zKi(X*)4_Q?cT08I>%IF%H?LhrofV5xs%DdREU!+UJz zzXhl`DGgT*=JYsuY0|J{#x*D2L(PZ>{;~}PvG*sj1AP91RDX(Ban$)MtgvnwyZF(Y z-7mVQ`u>vH^DmjFdDpvp<2ru}p@pa!xk;KCWP56`&XLkfzrpUI3eWJJ`-++EmP4Ze zP1r}ZS6scu8WFn?_|_oj_gVqc`N-xj5(RwpS8?>lnOXn&oqY~}bFO*x-7YNMp7<7& zI^#|F3%$jh=~l6qjydt!?i3QoJ%7TA zLqW!;-YtH1qmTIz~( zVLM91!E3q7x|2p+$;|um%r{X06|hzP5NF;NbAOMPA4FT;X&SUsOFs|u&`^!+n636U zUoQ$GFJTB{I|@I+S?Tq^qeVc_a7bKc%fDcmrJ{IF$(K_ROGtjJIgYFoaAS>F{W3jjZ07ec`84?H z^PM`bti?y_Oa0G7Few;so#?7?2;n{=MMhT&=2xOSDt6Ke!3mu)$99bsQojC!zRJ}g z5m*@ce2vIHcgEd2I59OQxzm3(W<8W~pj5(ZETK-u!{I6lLRWjF#^AM+7=6cTBN>|# z*snq1G5_w@m<@y>n*fdv)u-wdBD5bN$is~;URotZZt z5=u+in=)icskU;@D7mO($f#U*&7H@CT;uH(Vf|g+x;`BXQp8H)QfMCqR;D~WExvL_ z)^hHMl^PE z_T_KduC8H_aw$n=KPC@YX3l6AR%AjMsKGp&G{CA2;TVk$nzWFAYCvXUdm2O&kZnl~ za7w|4hk?|FCCLx?ZhNtP>!xx|ye75uv6h$yHhaW*zBjFUo?2QYPD`D6UMcyyFSte5 z83`P#o78IkrmZXj2_CvZvpzPIu zI7DY@6u@*ni{~}daHWA#HQV4An#;c4KCaiph50hnWhb7l;-`4ip*M&`u*mG@xxq;Gc7GYFLt&vCHueCub*ebx3idZ{NC=@!#l14*}I@}n0uuBrIxQ;DqBo3B?6A$IIdd>H} z+~w}v=M}nAkdd6~S6a!o3^1E7?t_Bzitoh5?xlE{ef4BsVOI4Dp_VCwnMRknzlZx{ z-dfcpSc;KSFk?ta3?Zim=Bl?Hc0_EEF*o~{tI^WNP`o-g!psM$_%>(A@>I8oA3J$b zM~rX`&I4)Xb7IKeajEkpj#t|$CMDL|op@p)w!;&?3JNQgS5q=K-Fon~$I zX!@3Kx%I(nwqapjZWW1)@atr5OD_wF=RYrTql=}aR!^in6hx%i-KCIJ3-}a8T|Lq& zp*Fog0LSod3(e+EVJdntwz!18w7%m=f|i>eM#f*Gx+Z#!KJ_qz1fi>83BF`EYRGqH zG#7@TJDw@btP=N%uNl)lQZP*(vOWkI-&)+#$N~R;@=#aU)R6#Tt$QAVA@E8`PK}*J zqO00WRai&Ar98Ax36(SIk*c~F@b!n@Nv*G)h$|tq!n%U$`&0E^Oe-DMc~kwvM^>UM zkj0YWhXyrV{52`c6xpSmHGh1r_Z?}QbQ*a8^~sbR+tbvU*Ui{cF~R*>>P#PNPN@pN zuQBjfC;6*#9Fr5Ir9zI$v@^5lEfpWK0$KFXsCEv@RTfEPShCV9;z7ijXD$gjwD2E# zvF6jdjQ6zjH~YOtM_l_fmbiOR?|btj2%&Vpo`e27_xRT4a2Ov(saIXSn522}3}5dt z>413h%J)Qzo0X|(aTl(7kLx=x)lAlwulzT)dnLe;+=G(hu+>r_bFZPzNeAw-3gRum zRca8^#=aw!rr?|D$F7V#cJg*tPv{#W?f8%06_2RX3VAQ{G2-|mrM>p2VWUJGlf!NG zHLn6tfb?QmY!OnrOao70sAj?>1kzFt`(}mt}&@ zqRcmUw?>Zd&waufcXoy_Q@zLeuzTh&R4K8@{?8LeLZPEVvG0N3-DU4oQZfsjekDfP~kd{REInr#f zS8z+}UJzN3Qv?%9ZyQB#970(lW$78rq(9>9I(TqjUYrS5wmYd(IL`dg+BDhHsZ5OQ zAzhU>a;hj~iT4L71J!#nRre|D1#Uts+;=MQI#ddWtoU%4`PGeb^dLql3bwsGo;l>x zcgzWUnLAEje&^C?7<;;7%hQv+Ms49mLs((zE1vr=TRIWHr|x!`T1lPCkQIzn`QLyc zCHTl;>0^Ma?Zz$p(A<94LUtAKD!8^{^{lNbT$^&9LXQrRPBtkYn^x~s6 zRgE&zuFf87JGHwk6C~~oD-Olt-Ej%{(R3VzEH%-W5^yY_Xdl$ATmV1jbPi6*mFM`} zxOrzr)tQhemXI1_A9}$OId9DpD}-GJfsA2pFXkJ9{u8-EZ+3_9iU*ToU&c$-r*+W9 zHS}C7vfj?=LHw2wJRKsK=?MuR;VMC(YzZ)&78P{15eMbK?{#lvs1CpChd0%eyK0F5iC3s6939@+IY@V49I>&JQ`IJ!$sFsrTnfAl0wmX^v4>rW~xo*kSX2)0TZTIvD0)*o{hyt5NSM)j$&yU6-xV(!Izm(3Yf!?Ewi~K&! z*CdCD+4<_aJ12(m{nWJE#l?(}rG5zJ#AR3F-1Dm1gj4UsphR*$zR&*?vW}#Kcc9=6 zJ71dFYGAIP01)(O!B4KuGrUdRc~@1mcZacmg~Oo5q2ef8SJ`>KZ1(!nf2Kj@rE0uc z%Zt|WhryZcr^JqP?E2tFwq(d&7W23mwVV#i6s;^7-&n)eme#rOhL3{(((8jJmsW~S z8)V_jCNDq1H&S<`tu8EU+DfMM_O`Ufl*L0iUM*ei*@1QG)xBIPi&bS+(>d{d#L9&~ zGIluq4M<7qV|r!rE#tpAGHEs;a~Jo2vs-enyISjua zWyF#K_!RS{7^Wv1DU>C3YkD*Hk}YdSA^saMcAKA{c(M>OZCOK|BgX1!ny^BJ9TLct zQgJOc`F28j%A6l;yxb<-{Jjq(^?!=_a{A|jh`lQVjXO@R-rP0cpr4?lW^C^;?0A_x zcV7;ZQ&-h1(bmH93&AVH`PYctR7_7@$&I2{{|coJI?bK0exLAi)F1?3eMYuy*3nvB zlfR_GWk=7xoEbgyt)8=>sn>5~U}3dO=(aHB#@R36T6yGCqzJHmSWp3a15Oee&ay_0 zp?>n{L*xPrA>_DU!hyOR#fbOT@yk;MNa>W2;f&zkoKs37zn20t54H7{Q9_k(J@RHt z!F3x#ySAf44!_*QieX*5;;nh6t%?nYON5R_|Mkv#6_G>gJ`b=g(-U(87lefF|Me?c z%Xsj7w@97{)(P60{nXY>H>Jj-JI>#ma33l2jDNQw^v`h5Fl#&mjnZso1@9U8Fkm2b zJ4bMR2-3w5kC8%77AHq?EH%wz#oMHByXFY;h2J&)4Riuu%GW zsLRx^gGO^CuGrH+XTpCeK}LPhCHFI7{X4sbPNjY7Vuz7tlk#Nj{p|+*H#ZB}{vOFZ z3>Pr}WbwmTQj8(1gT)>^zH&sf7pcj-z0CkdL-3=yaevYc4R{c${Rr)GN|P?lrU}RM_FVga<~eiTB^={YQZ%{%l$PXiN5Af#iA>KEXr%jWbY>$4YS1+1GtF)%q(Fz@4@ba)8^qvLxX1rn))7; zS#Sf9xAEh*qn|C@6pii{lnx{Pz$8NLUR}t%dXxD<{UnOo;+^5JhtQSEVW$Va7p#@| zlbTV*{l7tAj^h%c{Nejif@1>|=Z zTO!w-X7vw599XPVjL=w&P*^jWUAwS~n_V;0OIkiX8~h+aG3)&5sl~jewQ1x!vQO+L z4HlS!V=GQkz=}pX#{6x~m0102x~IbN_ZY2{8M<9| zsZGmY4ks?786)OHb;p__i6!T)l`f);0yBVIz$VK&veOJM?x#}E`377RHR(BQL*{b& z`1^bU&tzp=<#BfoJo}-b{A#K3X2K4`YEPhf6cW0We(UUmwVYTV`(`2`de2$_@l3s& z$*Nn__WgBP%belqgAs|#E5oviYcwN$KHcuaIuY1HlmYai=eJQgLW46VYlngIAJ=@E z(^~v@k+gtKmHE@ldtP4w`wRd6q=NFEi5*2Y3Z0TZvo%p%!}R4+b-cdisvLVZZw9U( zyK4|ADH_ean{@~X5`dh7JAVMrGPO~rlFs@1cJ`%P*I(2y*;Y82RO@^$!@{Hc5Ps%? z^0$*kEQj29tk8)spFO^2;s3jm7|;W}I<=_ZJiK-^CoQqeGne>7^j>A_{7c!vRr&VZ zwR7BJ;u@-HTA6>ZNq>nmxXSs|nT0)ZPa3$s3KBzQ#@4lA#sTHj?A9WrBkY#I-96t} z?{3;6M=dy5_fSbJihJk?HO<)VGD&k7Y`IIZrdh3wwR&O|>|ML#S0G%m%Qm(-M2$OV zqij=XO;!f_zS=@#->8co{-sKF<;K#Cni$|-`6}NtcN6{96$vFcZV>NdP{ygj+5BLE zaG7x9lV&r}{E$SwgHIA5fIe0QR2S{j%k3(EBJ_w)OI7s!lVS|``6&&$b>A!1VGw~+ z)&*w%xZ^A__5P&1@|Oj+#FtU5>kr|Z6-l+Myj#E(`I`ew+CDd3t88PQlxVc|;DoZs zqk1`*p`O4lD2g!tSd*G%_4`2NZ6)WEm0gwhO_(GDu}Yw|v@MOd3+Q*6r%yE3!?kut zlYGvUEz5e|%z3cGNLixbRCaLb_`qRh=lt8*XByU$@}FlXn5;xRG(Q-zTvNAHQ}S+| z@zV^GU)ue6(#1&vLw%@H_q{Q?X4y?L;Iz`Qa`!B_{lY=OZ_gEpJeq&Mxe^c7az+a#rYkjGda(xo0iayd`_K z?zlTc3TUpdjl2YBsf+mB!(SGJ6Ez2bk#zatpFsWue3ugs-46S;4vrxFYcd{S|=y z%+1XY*5plBcWW%VqtX8oRp6S5iXP(n?4y>Wr*F_{*|R_z!>s0RX{}25X{D!*Z}(4( zv>_Y>VaQ}$?s`?-if$e9>`94H`)OG`$7=iUej^Y@NNV!8X2s!R-u&4vVf+Z&!V?A(jTQ|Y^=*2NaaDxonv4BklxtD>- zX}{v&t#LzzH5x!h{`ikEhKnv>79Rm4J0Tka2Ob{%(K1XzKZzWuj))BH?`LN?3yAW9 zQg^gP{io1Njfkr^An)lh<=#yz`zBX&Xxvn5gw|FhXW3E{^^55u?J5)?A6C}?&K<<; zQQl#=C=FgbHy(yRYU^v_Cb+nArD)9}M^aQ&+F#&D{1io1xyOJ+nY*IJFB<_=;Qy<4 zjjy@dFc`8SpR6fR2wG9GCSbSz0=s2etg6MOrg2QKWEx@PH^DXi&8bGkQTcju&mQIb zJcv2Fh;awk?r364@NBG!O7!0hS_HTqP}w)}&0I-#!KK{z{7boJa9~Rt1~~B=aF)H2 zVnpQA+oGd)fyCbtSoOpNoq`+LpWWRq&RnNC1IYsyK^&adv`z-Cg%~zdi0o;QbWyXnh$>VS`?&Ws$ z`D()O8A~7*+m;H8mhTAKgcw7BDyfk_ELFeq9=QXHBFy!*vvEEULKxToi|T}~|F@{J z^>d{`hg>xWRsHXR!tCg>%dcfOV6?rnd z+S~*ndfr;I2^`qP$}MtGS{(5f2o`e=#TX0#N%SGj#QvY`4tL%GJ#9X+!*J3-RRSo) zopqZ}K!AKH^H$fl)Gj+f%LCTw76hPURQ@E((q*>(@*DkI8)LT^76pv9W-uG{yq-cN zS?HMzROMeK`hHcV3E|K?esiyJadLjp1GdtL&i&@kELUHo@XUzbg#1E_+(bzehr`V} z{V-~`1K;(*G(@!G@C}$zz{Fk5lG)(0s+*`B*GH84MyakQ|CR?qPv%L&xfN%k+v%=_ zspThibHS^Bk4c`oX>{#RM$3ubKOv_4<;c$#wNb5_@gr>zD};PeZT5hDusXiX?yHL^ zaONXtEW^taY!n_?G;j5Pvk;924s=^E+W4}5KfnmPMOo}kb>ii-s{0G}TMB$3_;5rK zFIW(o=yO)Z*4_`Ue2OVlfde{X_yVRVAx&VzVuaB%Xt_4iMK@7W8ictY_BT78E6y7J z<42pXv@48ruS-5CvrmL6bjiOIp>7AJx*OKbXodBruM3)XIEwJxjXz}lPE@mD(1_fJ z9Ne}Gz`Tb4VNz|GS_cQ@Uaxau4wxf6kp(Q%++&PnTP%P-|9;BonE+ySQ+@WW%?_+n z;IU|yjPw3G4=}C$H|Xh`^o6nUArJn)7nXtHjM?ss|6^~~Cm7*B5y0Hw<|oJ6Wwp9~ zO9sH}PuxPB-P(yPXl~&9B+eI;rbO>wfj*zTtdag`n}v+M-f!M?7M7n#UK-nAbishVZ1hlCaW+ZPS?2!neN zgyJ|&!sye(Up~@I7Q}$;@=3)$$Ck={m>z7|L}^f}_ipRTj2ZjQFt_+4X$fWa*0tsN zw;^f$4A|al|*bh?Ww?={eE*)Lm9H%JHU_#!w+8w$o?Z^)ZIy; zwSGW1H<)Ux}Q0eiDr9{=ZC^R@1@9goh2X5pICUjVbivE700m#9DeSN?{ znRU$vxZ`^zjlmYYZM~l~TrAxMV_x+2;LXH(B5=(^(9U(yQg$6TN#b2?;RmX&oKKFH z;)gCcWsi4dt#UWST_WTz*Q>|*!3yB7_p$_Y++J0f{R)P?T1h0s0O2x4vMH~Wen2RK zU79pSVdaayzYIa(g2;r)qdB!KDvI9mf<4x6GyJCo@5m#%Y`jpjA7LD6Ovhm37PLSsLD zV&Mi$7Pf60$vlgZ{-$a2LTw6&F?R89V%UZgc#;o%&1#P<#v9bfBIfbXt^6ReuU9}CCp;vj<1gU&Q{doR%D%+BX<_Ku>j6=>=4#qc?)Gn z#74XRRCd1=F$n~4!5}JA;g^tS$NC?SVzcv{?+N1ML6S1 z(v!r%eI=e(di?HQ?YmJuG<&;p3$dTj|Jwhn*|s>eCh!Jc+9fIh%ar$w+^7CZTS_wpI+nS2KyzCMO#Be>#NQ5uyXD z?GyFCds+>FYc)_AA@tA*iGhjUo31&5qcyxb;O^8l(LQ$KPr82iv;8jPBER{MF92+I~th}hN^sHF1>7J-3`seiT-$7TXOF9^vL`gi07mDjcg=-ok_nzI#mpyT-*rW01*alf_E0lt<(v~e|zmJLrwQvWT zD(!P&-J}yUTarBiuq@^jCaYhQR%Wb_HRnV8lm}J-xr`nzC{CMLALZW}UV_LCdCa7k z2T8K;*g#HAE%aP+K@UU{DM0_qzq}!70Bmc^nui~5+L9w{)+|`?&krAos(tqoalM6} zzz_WU8A7x{wMFjM4IwXF6p0R3UW`%@iw#$xL*Av-*zG3C4Go0ET{Y3opttY>9&15l zAkI#Szvu5QB`C<6fq)jo%a-9D1la$si+&1uhG5YVxJ_Lz{A2~R(v9%XAV0UGz)m_8 z_0yIw5HAy04%|BkbMc)lFYz2fEt+=BG^(Geo#Ui;=@)dqiWx zk6V@+4?0xf?dW|EASDBCc#+vbIYS?~On1#T-Wd-0)b+NspsoL1l3wHyMacbuiuXG% zLd;uA4nMwzI=3p|*z$73etc+o`ih0;+;J{K(bzda>>lAWj2=en7CjLsK0#JxU zV3+I|p+Gkui8u;789;DeV3F&lC+VtmadWi=_v+v0S=~bcS+E?wAaD@{cIi0grWY1= zabI+;UK0@G8iWIhR@!$jfz0>Ejmd);soFu{YVQLdDH1W|z{Etoi&+LFW41WH<2Zc*o z5woQU1It-ii>-xg?D8yXb@K8wu}-Ye6x#a}aLoR$%zL2!eC&uXHP{pSF3w>0u&tAN z#X;P40|gX$`5owHOLRG7D6IPoef*DKsh-R6sZQ_S3K;eL2kujze)dWoaPgA3k?!I6_8gJu z#2lZWoC0oU8<+M{(ZusgJux3p~IqS#cGB)2XgsUeniF@dtm>(r< zM>h-a{*|N!Co&BzpTaW*78O5NuX;vmUOKS;jFDVPfzXFoq<;WD zh{fU08tx@sM-|i8xyZXl^{53@K zvQr7o@?z16;+j1q2cU?tUmB<5ZrQ0^ij8eJ&wk~&gFncfmdLV9DSdy(18$OU&urAl z43HF>6!fgoH0v^%Of;6$Bc_0?l5EhF_icKjiqsk{O?OU33EELsoRA*AmYA#)j)Q# zsM1#C-0;&NLqok+RqjvD5RO0dWI#8SM0l}_MXNk1tKsF%H)0#B+q8BH1tKb-PbG5`92Qwx4+DhvO91w2A9$zE22$wBWtFBDbI zx0+4wyo4n##T4iDjvsv6>q}w`u?Ge(q~;jMzqe8-Pq1Xa!o7oQpi@?EFkZToPL^ec z%i>MBv7-qc1cn zXqk|vSHGdxG1ks2B!+?Si5zyp0hH!z5{;?zUL8&2kKenWdQXanzIk0e!FQBz4YYV+ zWSR1wyA2g1-_E&q8*-LoCD@i@m)JA*D^eQq&9S_7)1<_|XXle`o?i8?@@xtsu|tR{ zrL{t>M#z*RYC79GsYEOIe;;Uo^I3M>OZ_&}Z!;901;;UlBzZq-pFIJvIC6|>Ke@J# zR3g6n>Vgzm^$~r=3pLM~&)F_{m7&(}XrlxC=iRB};Y4_H%n!UY3cL{pw$gW4s;1TOnm$K^Ef z(I2N^<VjU+HKe~B-|Tm7ad!t1ovOQXN+@ouNyL^2vO;C)TmhC!+DI64nvsN z!5*F9BAd|m)bG7LF!owAht9J#al7B`*SpuM3TqGrds zky-VnQWD1hR5`LclcY79ebQIbw8iumdj@qeTdAzL#Bh%6MZ^}2Nz62sHh6VBYB+`~ zHbl6$kH-^Mi<0EWauo|}-8bb!t=_BS#W&`pjyM`C81;82E#24auk8P_ay0ggWk5!P z_MVz?@68KbXsEHl zlNw99fAl8D?KgT7R5$6=5mGsC4uq<2_lnaPwFXvUKGLB(QQC}SLN9g+(6D+!N1L?U z_K9))3-SK(q_l~tmUu=Uc#X!MS-6LHUY9$Osw6NB89RN_4d481KOrn+Je~Oe%jdEsQz>h zLPdYKJE?SDxcZunm(MPzz0z}yxZX-({0(c$kPoU>)R&So)D<8@QccY~mQrnIf?|$0 z@;MX9d)|8FCXkjU#EpPbcHCZ`e$0t8AmAf!JXOh+N4QN10LPhX28Cp4O3<24OQQSqTyP!1de*&>SM2#oJcgU(Ac=Mu6#UD3Qp3QgxH!#+e``09it$C&GPaTB4QNltbR8WO zuBF$1n`7<(AeeyjnUOXQ=O^fkB#C7ycA$q_$JP0t9(@d zPvnL^qv1Pesu!L!kaidkd(8HQhBTJ)f5qLR?ygfeped9U+lgtJ3gm^DU1~lEW%%Y(4gl)$4xEtQ6TGqWmaUxb8X%9lSe-XODz5|{G7kB(N zrY;gKH|Eo#p@cwid4u2la+o8A{B6{gdZx4xO)yY|wA!4#2;dOzea~op>1SJ%*2*Zd zR*K$dnCPwVnBZG4IX@m^z~D8@4IK^8lOMyjh3?bJ!;SG{@rDf@USpZYSze=gHdsf- zb;O^zlj0hevr>A=!cDDU4ditT+z1_a>yiovj862;JMG1P6J4ZOm_ZuXw!!r}@b01> z7=`_LzbJ|mdaSc(D5|2*>D?(z7U9gDo_09I)YO@pG6@p21mbyp#oy^xD* zx!=z{w2s3M;dW)b!I;sZzH4n>k#bQ_rPvJ6WkqqLqYCq9&+>T1PSr>v6V1Ou?!vn3 z!{EPF$7Z-4QQv{4j_tY4{zuT#-QVp)etK1wux-ba!(UXm*PkK8A4=`df+&$uf%fY=4JLDJYG}s|>oQq00mt2@U zRQGvcUWTy*C0y)MjQYiE+pB<&f zR~NBzOZRmzDG@lfsv;A=5wA(_SfK`f;Sm%6T6|?D-i9WkB`b$LQkLb2=-J1YJ_-g> zY-OvnVDO*%*1$giZ!M&|?Z0Y-fai)B1DpAH3>b&B`S7xOIBJZ7{SUjw`o~2~jf&%OjpQVnK0vmBa7L)(+W`)@s zzouqx`%%6O){jJN)d3Z;XyS@~nK>O(q@^zhD@A;Qh_+XIdE@@|q0?YL|7(iGVc?C0 z|IOeT$n0|LniJZSQJw@8o(kmwS1b?60bxAfk=`xi1`o+2041uGrzApE?aZT()iva6 zX!{f%F@fI!sMvSidpj4A#3))#-F)ev_J)86sNy^!Z8FI~mM_O{^2)OveqDH=aInIN z`C51-+SN&<5M`o3-Llii3tfc6s)kNZFDmi zkFbJ2(CZ0nz94KX?A_=UdU#XLNxK)6gl^Z(a%0xq<+k?U zJg&g)c!HK@rC=NY#E|H*q12s}6{?DjN7D{zC$V_xWcLOQ@mGO@=xDsH90`RBstX3Z zOs~i$IYhA(1i)W7u08MnC~ZD7b_&D&RxOWbPh64=Ts?P_@-7g%S=~vF zd*9wWpdH~#)Ep!LhilYhLScW!Lh<*)D@CF2X7@M*6F&QXkIT>#&n%|~o#RzhybeHT zXaz(AV^ZRN;RZxZW$#NxL8E>}9HX1@h>vBswBS73?vuB^dnt&H#5LWGqH}qp&d?eS zMUZq+qu;AVfGO&}Ud8mtc}^yqfen-NFmI#V+Px4z?lT*kabmN6eYNXw*0>2cABFSWjj z8fDLHXPx($vKs1?0EcQ=#BAh! z)v0!%tYm#CEny+xK-BW`aa`z1HP#o>T@N>@p_Ryz{fH*n0r2d%y&1GHq)}MW=Fu9R zIZvs1xBc94F<($ue6(c4Iy4H~P=pm;{FwMstZoIEOqY(Y7Mb~ZoKdd17GWlKeQf|C zb<&eDXGRD!)~Tbvf{S*vs;L#yJt%;;k1dikzw3ngr;FNspwB?&q5|T>aa>)`>jCv? z^91(0`>_&3!Yd3unNho1HBK(OFic69(!{O0drYYxj{p>neOz~3o5i{1^@SnnQP}{( zK|dA6_0}=VWCb`ZwFFpJ7OqkAyuF<&?Q%2-pMnV4oN>{m0a>|jw z10Ptpa9V!snIE|u4lns|_2+vBo^yPw3AUII z5c9=9dq_rl=oPE{rGc!q0AQH;+u~FPD-s-clpn`jx5K}hn!paHM`zZ(1$`FTA%PHH zTORdsiRjl2WTauXX(i*73ipOU;ZGCq@x~hTU|E8ryg%;@AvWaK7ktwt`F&=(@}qb- z11mAH8$3IyQniR%{OB(W2$jQLHpGoS(V$|5UaiL>ALlV-qbd*LhMeWK^RCJO)>1=% zNDFEkQQqy`ER50qLu$V;j~9(jQ=b=i$PxIkn_TTl={JH`1LtvJ!}f`kWQOj~*j=|z zt!qJ8U|En1VsQIUF~^U5Ay{EHd3lKNaugG8cCl&>8s;vXeu19uUvTg4jY_wo4&8!D z!|sAfv_T64WUI%=fQT&9!|M=)cCpPH9`OgXK$f`1Xkl0smp z-59l7JRztnoN<;p#S8CnSdp&w0l1rf!vS5l!WU2|eCL{uIu(;FNG}|Zmv4v5doUc| zUqIc%m|C#0^iTe#rO;nf0Ba8m^CPEW%UB$<={&;P|{`T3 z3|S)NpqY#6{T0~jhI&GUR2K2|VS3;@^iWJ+X?F< z?l)YW?H40;*7QVG&e#(T?X88~r_#l_}Hrb>sv~IoewtF^{PLU(!GfC{cbXcMauFX>R?P zg*w$f88B>Lc<3CFOxn+&_wj;GK<8{kf$KnU85QmAat>8-c z9W$#v{ZdS3`yJXS>Xz>tY(3#%Lu3JpK;T`ftd9;eh;~uf?|LV9Cx-}EleScR%$o|hWA1?fw z8xWyJad-Y+v%@Zx=77(>{Xyyyt` zINUM|+a0rg_jj)_y(ZzPcdsY;6s=)$R^aO4PcMb@yvU5|$eX{@wDf06nYSc%W;B%< z4sZKzLrQ23nW!IbQRZrwoz$G=8HU-7Mw@=uLbOOkvyJFtH0$3kJm6bQOMxH5oTmAhp4i&>hSin+fBAJqXI*B(|tJl@%Ui3gNWn={xnHknM24$JM#v1mAX8HSmQF1g$y(#db5de$+5p7aBiTy=@9q{>{wc&y$?X)6Z%L~2f)9h}%( zJG|OD8Pm%WoIK~v>iXua;Tipv&K`4v4xD#>0^=AbUhxjzs-~Wx;r5Hs0kqp1$G!^YNNL_j&lrYH2n|&E5A)*9cO9 z%M%{A~IPK86;3Z=BWb#0a3%CLKs?^WfB4+gh2arP#MdVNSN9%h{_;Cf&oHo zRi+S91`{Dbsz3sS5SbE52>i~CL;LM|y58S<|NXvMOI&d7xo4j}eD*&3-s~MqB_^a^ zyrXk5XziYJ=@_nsJ0`F0l)es1qo73>=q|Jy&OBn30L}ARRZ&s(14k5~p0y=cLEkvb zv;MtdVKZ?;%3PFquK+o71s(VSst&_XXVa!Klj(6jp!#ml?X0vRu_z+i^E=e^x?|2c z>A}RbLKRBR**~%iEVj&6+mk2mW{+Q!bS~dJMExuk(tLzVnL7%K5}ka=R`$?oIx6 zY#hC;*W)Z*a3M25il|BLSfh**SFduFB0xUQ`GFWc8qjZsh-8*7g*;TNyByP-Rr1JW z0R-Eii<_B6Kd8DnHsxj&Xvg!3bT}K3In|VHn=Uqa=67*EGlXfIETuRH4Y!<%9B#)} zo?W2%tb&%3pz;ni&+IO4#c!#2l{t+!YqVORUE?b;K?3~d^JkP=wLV&H&||o$ZcX&H z5MCSAp+hvvYUwei9=IKdUJf`;af~^$bSs+}{e6<>@VLcbwyZVlW?PJNx6Qi^_mQ9> z=Hyv-o0gtPqhyX*4;d%07Y=Qm4a}T6KeKu+nsG`vUMry4QTWdSI%U2&2ZW`)%fuUI z4QDS(;+eMcs&xEDr~oV@U?q9NW{{0p8hWBxH1(Q56X_%J54hbuDKkH82Ba81M;oOi zr=|oRh|Cez#*H>zk*?$e;v(hA!VRa+uY9C1xOlg^jZnDy;jeaP zJ&~Z!bah}}$zfVRf-Td0|6q3ec_}Pgt>;8*S)$*$KdNU-&72aH3({m9rV=y_u$mcjLvS$RPM!|rBF~)swBn?&9u|t!8 zOEhn+H`o&$rmoxo-ZOxlO=cFTZ5d6u!QV*`g~HR>DS5e_x*4PnW%V!laZ%Ii=YrMh zFPjkCSU2&b0|mHoGAOTqC;Ke=mVZ#5RlT~AOwdxQx@Z251-3rO@&ZgVI(7+kl3cnK z5zDm{RCAwno|jJGiTjZkh+nH1AKiFS4=Upvo^Fc1`6Eji)cN(tO7D7)G!Xqm1vJn$ z@H*%cQ#8JoL(Q^qB8INR)6bQl1-H~`i?1xnD0TvdUKX9(8(D#bMSeaMm%Lg-g%I3uv7MnX2-|1jXz)Od@-wBJCI_$A_-(undFP=9VS!8 znabHe5dAZm=c8MnGyg-(uD(1G*}YND+#rrW_nVzK(@6rQoiba~kH#)Sg<-_C+X2VR znItX8$~51|rjzvZ0|A*t0c#KmYFUAI1(4m{pr+M&_2rYK&dU%g!H#2z*Vn!jLE2xX2bErko0O;`QaL1IW`?kn$$au6$ ze7?om&f;QE)8zosXx7tfGPUZ^#f#jCJE5gc_56C~+<8jS5S1_NYzp_uw39X3aq6Pm zF0%Fa`t0?gkDoa#eJ&wjknH5n%jj;cJLx8B$6A!*G^R5 zO7aTRtK1V*U1r@KZzjbeTDqMU-v!USzxTBDASiz!-03<`{-kNt?}?UCmKjFRv^mgB zrHP7fI_BBJz$+(4cKFu!)H!mgt+;X252yz*cf8#%@~XP)jBUSTfz~mPOIKUEYqhml zKaAF8Bcy6X@Gxo8XL@E`+gcRRL5U<^{>aJSB69Hbqj9DuTXd}}Yl`(!S$@k(BOXba zn)EEl1AQL?)*C)7%>*jHeB!!aLS(?hFn>P);?tBTSe3ZFg^w4-bIOlO$J0{gRT{%V zUgpXwqNT>lV)3%CIwmG~%E~0D!W8td_zn~{;(M12p}bJ1jzQ(st-b05u3=>z0f$$o z0ZqJs4m;lU(cqaUZuFGw#gzrHJU`J4Z6HO+Udqn<9JrrUGNltJoAUFsy!gJZnm}yoeewvD@@L_~H9Id;jHf(Jx9kMM zV&iu~c{DAR$GOF;P?M2Y)Nmhz5^t<)kuWA4K>w90$AF12>fF+@$ga}CN((@$$SO$9 zSxn@@nd5^kNu>drgjFD|#94KAmKD7d6F56j>*J{luaITEG z*Lf-CMdyI6g$XDhbO*HHa3EbYNvUTAJ91pj_~}e5fq>1Aq&ES%Q93~>AfR4mza(V| zZ>dO&8+V#C$AYr^flZ__ycweysx2D2cIPN%nmGFdE7cK-do=Xyl)svso7?&2XpSzE zlS9CO%9w1RLHd<}2UgEd(&l7E7bWcYy?mnbJdHB5z92xc%X>-`BTMW`RpSDDnW3Y_ zyf9EDubC7{eAa#jv|ms&D;MkfW?FwVG~VdH9X5dAmO0e(?y{5x3{Vh}sH$G!)5E?n z!~Yhvq&_i9>W)NfoKHvRT4++B#*(1`NNxjNP&BGvRe$Q=lEQj#5IVbDsZxu3U^3iW zQZayT@^+%00WAr1a?(89nbz2T zv@^?k37_my7hTyoaAU7gJ&UBJj#>^A6A(V%D$BAT_juEST+B z<~d6Tjrob^!_7G<$q{zWp{piAix*8UmUd>9=T@?|!oxJSr_8(~{4-l5&xq`l(r3h5 zpsn1e7bZIEE~~c={0}lfp;!wT=*nF`je32wn-8fDq9+(LGrtobzG8H?1l=ji($FHd zf$kOuDE4h#hZcll51Sv+Jw$co*a{-%32irO=-CkCL!)sHqO-2m{37!^sQ#!d=b#SI z4})A*TM?J@Or6TS2lY+Z7_q+GA~6Zdp_mt?W?TJ7`Hmt~(d$mJF+*P2YhVtC8)Eqw zG4bV(+=SM@g4b!l=``WjCX&r}IMW<&qvOSfogS#XhHN6k&NLuOe8uY7qZ2_?peEk{ z_SRDp@5+#kie}SubPADva-;KO^%7ee`6-=uQDsaVFm_jCo@jnd(S==!S=*Y7YNxFRN?1RmCZN$lop#vzWSTsN%AoHUctkePUK>Ox%lhuFK z72%3|ZY%E?dbHkKqQgHg8o1IlISmgAjkd@azol;VuUNIn@d-CRhc&>SYM+^~6iqy5 z&OlUXoIB9A8+4%prQeeR8cepuS9nVUeD+*Z$z$cl$zCz~AR`UYR95>7N{Ew?pRcv`B)|lh z^)#GpN0)s&wW7ggT3O`auntM3gMyAB6N=jyni?kMP6>xpD2;C5|MH;YNt+W$k-a@B zrEu5C5IQi(urj2<7>6F)<_*Y}G?L3rl1?_LcT|!>Vkk(_a^PJclf^_YK`eRizCGXlPnF zJ1yI}kI5)B0sCd$pivgLQQWkoT-(#d$K04XlL)p++Z0TCC5A18RYvQgrVFyL9Tv`u zyeU5?+Tzn|0z56exPFt)3(q(lbHUKL{F{0+oZS?ELh}wKpXC$3$2=t$r?%W9$|w7w zKPQe>M`09EQr>Wi5zI!i5wC|6O;)qm)8fOMF$TPj z?5CZ$_L+Ynr;qpCE~NFtkeJl`p?ER(nrxy3&|#<6e|)Z7sd-3MlS_ueIP z;{|9tNtLeV0ET>bM$v713v2l-SD=j|d(|w?dyu-Wf|lH^Q>bDaC!^Wnu8xGor6z_f zrmkg1#_<$Qi0X6fyG4WHs#s{Nkpr_DqHS#ywF6aghjqiokoAT~FA^nXrK%OdtrHZh z3l3tgiEpT)a}*2kDQY&>-q}+rZZFDe<(;T{vfpZe*#47#$;#)Y4jQtZmxMz`KL#1{ zNSIh$^Sa%DpaK@5!7= zGUIXNqTOZZXxc5)qk1{HzjT?_$h*4`yRs_66&0oQ*Zo_~UFm5_ff8=CG z=UmxtY$0P%jVjT!JvqFI=JGb zzU46;p#a~s1WOhS#p8fB+vPsi7N$2grKtN^|9DKLk=J@;gFGY}D3;4DTWD6Xm+|#d zlDJpwU~xIp!6IiIt7Xs807Swz%BCdJ&CAy*D#ybn=u2k*W%EQ zD5>_?%2-;x_E@_j&%~E|5EYb~x$HlU6>bmn;+jPVEH5It%nlKzd&0T^DE39e+)JON zKTKPqhqMl8h_>u90XmVp{Te1MI-}$UCr)_F3P%JCbU710+V_Be$+$E41815bV#&_0 zW7{yqx*Rt}8*Zv#m)ARVbWfxlOS2d(p-nZ4Yq^FW=OeQo=TYC53sIYG;u~{%-KI>Y zo_x9~c#}J|oyXHd^@L5a!^tSXD)M>*roy;w5Ylfv4QuRsC+Ft8@cu9(kBo^Im=F9D zg~-#sIiEr#x{&+qU&i7V-Pm9wJbN&krx*q5q~Y=s$7aMeK$7Bw=FUkW8M zEeZEf*d2oEnKhI05>0T(?2&v1DVP>#3ZcXKW*C^9Ko@c*nw>SYP#|;=ZzI_8^cUqV zGxw}kg!iXJQ#d4`>!k2CJ`9A=E<>y|EbJtW|GXpap?PNvHfv}~8j`Io zG*W?~`+&{+MH)Kiu1>&5s~PiT*`=A}{t0P-P0ULpUE+|V{DrWAYy@Ds5h@*tTa;Fc zP{p)r&Pe1NB4aSA%PD_>jk@GUH^UzrW#8F^xj51MO#I+#~EZTS)_0utYW(yV^G%R|**hJv$N8 zpcZ0?>;AN7p;~QYiq{=25I&a!KT+5+5S@7voHre@CN=T>QF>GN*->uuj3Nu{5--Lp z4v^J0zHOgTbT#n>zCqBRt_e%Z)(*1rjZqxqp~iw72DKef5o*jF>BJ3gm|?WkjN0ZV zy2;57!2%5|kD)jKq_W$T`Q#IYCfy8W;@Y`>fdtX)nuSr#XdRsh&L#_iS~YlfT$s$N?K)^?K|bcFnvlS-1ae=KD< z{(CZDf7{j}|COu8kcffpYq=S|6|>&nry-4t@mn>;iRQ<}pSDGB2yp>I{Bk*)8Jufu z$4it<0#$YHj48Is(XR~2z07a&x!5QCvYDH70njb_6j#}ejtOK?mL}|bCBSr_j*IOR zM@a+qNdl>X__<0mFYM8TrFP|P12I_5ndYuetf2Wd_;2^-Mcj8dHp8ahXG=z7~y3o z!;lzTuyd>IaM{JFAa3;l${UiYr;Id5k(BvJX#szEO*0g2gKSz7QPJ@ z;;vlVvjE!08QGPj*qDM4&KCSI6H3}4%9o0MMhyY&h-s1&V z$1)Um-ntyE!M4?e@3t_;Y1t2u@U-1JyC8ndD4wVovNkN1YEfr%1s;s`o;Z3RyqTr> zNL%MjWt93Vk?k`Uq-oR_PLAwspsKowr!@e*^N|`vItshUM^f7Is;9|oEuoM?toT)o zG|dpPSz{BzrE;o`CyISnWNCL1iB+qtoW=S!8x>Sj-*z*=(1)6Fi!y0yUp~vXg^@eO zaO(+eT|DR(c(Zv{_$w5jeLYW#micS?Nbusdq@qN5C@b1L)B)6^i>0me2|7C6{vob& z>bOr#YZr=p>Wkb>t!1@zHC&fd1u^K#AeME?iSPT29N5SMe0FOya!3*^w{!2Qz_!&O zGlaqD;bXTTQPPH|r^h)$?VinH)(z<-MMMcOX;XjcWVNO1cm~$2;uty=YiO00U*AYf zkTHxyBLjl^tQ(UVO-ERpgLK(PSj}=;D@8BrAqJ?9e&lCw0Eprd_#iKr~0ox(p|CSsJG! zb8rzFf}|k&a>pzs*p}JZT~1YwG#KNd)qBvsDRqV_ENBZO!_!9`+C-Ku zzN=Rw`|dUPaCFgIaef`A+tdmu#Ktz>mj6Kzu?KgsjtV1UqS~@}MBo&$oA!-X<>jy6r57(H~i>~!D!a_^rb$_BJ#RMvMnQf1TVOz)fN)5qTx!#R3@IHXZIZ6*ichN zQAb9^%|}>irwn9e4KsF26!pr)0U!K$&xrWAZ&HJHg$gt1@xfQog07`3E6}$SFQnk!|3q$)!j*xTE|Gw?2uJP+Cf61$K1Rb4h%^ zizk@|4&s(Ei%Ri5IFopkl_MT!u~A5t8qG!YCH8t}Zh%ba)-rRT_s@will@4~pyuEx zLv32l6dQFu{)34X8Qljz)C?WetbSwqq175Ddr&>}yq4LWrP=&P9Nkk+kjc0Wf(UkXOkTsqRY;7k2^>z6YGdaD`y*@CIzLlN-YCQh(V+_ zZSaU8&8eQ4z@cNfP)M{pCe9$n>R;k5gY!X4H~i6^8e|X~m(wQV*vOTRR4-;HLp!Dk z#)^7?67YDglnOWL*8-uBd}=eMt>3IdXF^$yIB3aF#WXYFy47i=n)E8(@b+RUdG9q? zdQ@ZF%su8-xm2rjtt$Lh^%y@XEJBn*(W9zLX_xJJ_l`QNYepIX>_ImluMD99Tu_@Q z=RB(K`2*lfUXN(v3FIW>k=(8QAb#NVlcTK;qqpshz{v*h2lSd)Q4w@fB95O=il%XR z5Z7HvsHK)*q1pzycUA5n$u|}8WIt&@?$*Z)iF5kTynrOSW-obf8$@2>!@gi@BkxqG zQ&3uVH5(;ZJ`W0$=1G~tZsQycR{s?BZKQ!!j#yG=HJmRvVVJ!Ektmo8TE+)miW$hY zjS+2rkWtwdGQVuqKOTL3k;UsprJAczVCWc#Ygv60!u#0ZvJuAjZjCLoPoSu#Qr#F^ z>9?9p!+1b49a~jo6S>VjXbLx1NY2_iB;p33=$GmBku*)V-C}aH@#K$nQV-GAcUIz* zB1(74JU<&dIWo~i)K0U=vDtmP*&VI9jf9PLBL&}?iC)Mmz#69 ziQMiWfw5GLZ6ZF%Yrc$3K?b4>RdOSzSo@u$XuH{W*D*24M(CVAGK;#_L7KILpbm9K zFMV|kCwF}sIxuuGtz2lPNn)k_)VOgUSoZpinNbKtV~7oRYMNZiruy)z)s~%6AOMUu z!|^ul);r=#BYTL=p{W(Ld~h~*b~E)g2P(}a+wp`iN`aTwa@gi}e=Xp3qoUoq)1pOJ z1h7UM*D2a~(xl}-SZ^_I4 zo7Jz==HIs4hB@#aq>!^bvjw`c8#!2>=;AhrGXot+Zrv6^k*Q0HknAN&e0CQeEf`Bn3-^{1u@~1o`Vae>ftbmyhmWCqOR|eY5v}^Ta=P zzS!QHujooy+T7lfQ62gsd(vObIbRiKC!?fB)3&}#`M`F`&Q=YEA+#q7_uXoA-}*-R zkNxl=vTiohQ>pAG_#LB{q43Afdp8NSxxa<0z=x8744&WscLTmb!!q*ydXHrp{?AK! zgj#}u^=U*xIk-wxEx+gP41%N2@fuz5=XVsO9!fjz?|QBa5_kp@zYM}Ekmp?^AgN9T zw+w>c8R6HtS)`}h|KqObJ(Sdq3v~Dc;2F8heq&=u5}^M-h(YdyQ|i*`S(TL>h8vXS z5TAIn)FnkYqPXL3;|=eGI{A+OvIqT)3-benq2@&>_qLSxC&^P8g!B2lw9`h=*=<4# zkhkat?r3!1Za*BQ{Ng&BkprDN|GG>r905?(s^M?T|EzoA;@@r_kV()MMz*T4pd=N# z>J6XIC>cYEE_DAgR#tI1tZ-AfAvE^w0-F9+ARzk6j(`h@<84dix1Jq@P9}X5q$s^F zA=~!#{wTVSEQ7fHQex-ho&mb_e33KrvIn^}t1X+@X{&Ps|t3;p0n}Fk7qmQ^0VqtI$a*3$hFSXVu(*MFu zgA8ov&7pWVy$nNWXVu*P5=y?{ERc5>5c2c4CV~*hagj#P_rY;c6z!4hfXtumA0CiJ zZAs_s-cksXZ*r+#*Vp5nEG+U%M8X)L*`SFuQ*5);EQn-U(XmZCzQLV*DT#e>H$#b7 zH;zk6&ShDK0VW$3v$3#{UNC8(xZ}e*#`tt}pRsKQ5Gxbl#Qj}&XAVo}2;uWTYy~Kb zP>r~4dvLsb#mWgcMC%dOBF7Bw1qb&BXD6WfSC$C6{EW^R!%~f>cf$rZn#8;NK(Ym4Hnev%BH`hqPQqL>-+Hhn@b=2jym3v0XTcn?*^pf?xupX;s0S=(%EAj^X z2RKNPu5ygeS5T~>jbg{s2;qj#IVd$_bA{*mavZqPapP3{=M{(2TEgAJYSDryHNC6@(pz*e&5IH3(6LMX5K#0I4Ip>J@38ZE6i{HaDU-b(bM z;~*HJ`2R&rRaI6gTxTB-!?iI9P#w9p6he1%c&Q<`boI&K1mcHQP_G6@^QSMiPD~l9 zyZy;eV5v>52IC$J1JSG4L_^53a3FvOY-V^5LyUQOGp^xW0a6LN^dJm<{iS@0A;I!r zip5SD?pBT}sKcbcluPJBKnB50_~jec-{|=HG}y+Yd@-&9WC8{!tXU1Q*{{q*3C`nO zeWQP$CVgHp(X8uheJ8wi(F(;b{Zj#V|1bd5_N4l97wO)bJ%oula`usTNoKHeWFe!t zB&%&q>x2fR9r;@C7lSp5t)F3{mA+xg z-Kx%eA&Ax)nRaCe!;0YNbD@s}!j_q9(NLN;k4%`GBXG}3RTKiw$z_i%RBkiK;P(F* zEZDhS|Ni%ok@P__%BcqrIEpjh-1w3ZEFG-sL@P&S62h0vYx~2;+?Ievkn1Ls++ZrP zV2gE?zyOs0D9V`~6`aWpKsKaxQ5ygyZ0stOXPHTa(D0=|fZxOJKZZ0E z;Rek6__^503-eKcZ=2BNlNUl6!^;rhy|UOQU0>fl+DH!DPh_yWxuIGGmflBEhBpOE z#kSp8r#CtTJgjbGON|$q`$c*mKRC*X*3cA+&Id_^Dg1m%r3I1;V?6)m4i8>{9Gs<{Ie!smF;;`tYOhC-w_dAb} zzP2U4bnE03EzBps=t=_B{P#azj*sBPsHiGj>3a#RC`93|jZW&2bc&6b{uHyFLD)VL z6nCq446lFv3&1UKjDvO)3m-yZzw|`!2k-0$o?mN>(;KO%SnY))#`p)+vNx(_~t~y2^Cd}PA2pkhQ!&a6k!G#y6hQQxj=%>vb zrZ&&4Qs^Zm7UrA@TQ@}E51vXPAun(6T>>0oS24%J2gaoRQhY3P9Jv|-~4QK(mhZK6TMpG4sP3#44v!%*+HA+?c*i7ZDs3<&1uM0ExOnuHlnLC z`ODR8dabSm2`)}tbLp90X@YN{jylHqTxuX$)sNN*&2uHNmp7?V>Imn zcIEf#C_i%()spPNP{%y5|T?D*KVZLr#>Fk)JS&8o6xCK(n)`p|(-1JRd$+M#rAjpM^(+S>)!Xofb zxo@p{LWsrc%PALl`_`qxkW*3RU1e8~g}8dvIE7TM{86o++5GF+bNwtrs7lB;uJ0dF z_)0U;#AEx@f`Y=X^w;16o%an(&26CFTQ#iM9os;dC#n!c$f66HYeVDt6Sn9*;X)?p}EHJd5>= z6&;prkJvuMpFcOv8@xN2NjR_%Uff-X)sObBv3M*9OD;%mNv`#nO097W2^vnVHn?33 zk9gGlR4>cqJ@11dUlIDXz{$XBx88^kMeh@jgWg{ks=#Cegb9rzc2#_bSLnmi(0vr? zq%iGKXV&A98zGAl9HrI*}ce- zj6AkCPBP|LT5b9dXJqzdE zWT=dqwu;AfgwBojo%Q{M_77q8=l40Oe6OE<+j1e~J}4#qxbK8)`s67O$*^v{cfC_?dHV9&I;zf!jS#g1Tygx78WArC)8@u?mA651L zvZiI0sY`J=%3jk(EREv_kKSevN_czy0hz#-)6OF7Zi4)c-Ocb6=~(z#ghrj0B54rt z$PM@5z>hVbx|(9MPrIFcg!or=@J|Zg$YzdIWrd1rj^F-G*VH$j?->K9Md6eV>X~mv z6g2C${VeN~a<;e54baZpW&3o;`0fdLX?PgAnyaywPLb?-u!xP0QR{X{*=sK}H9G80Dlf)1^LBC#SqH-f2@&`1&q` z{(pr$ko=~r>R*ir{`BzMrum=0tw5r=oVb<#^R3}e;Ewy8^z?5VxPF^=SgXxx_RcCl z8D6ZidX>6rLtTAM;;iZ*Wc33_@K{I8=UzXYNk8o4Bt4*T?{sB(Wy((phv%K+bWL=OvywHw%BpQdb7}xTym_fG~~Czi73L*>h=Bm;C;2ax0`D-2&k5fmHSm&`&;!h z<5hjlkEo6+I~AK0WuJ;D+;OeY+56BA1u3o1dB(A-;;E{hhKExy)Ajv~!hMN0zA;zv zSU4H8P@uW_$Gd^XMnUVxLsdHTuU|Zh=)U(pLI3LQ1Gfu$t-q}UKl0vD&^Wh#Xx?!+ z7Z~>xFz$iF{YOgiE~7Zd^`aNN(7p7@%lnl_w!bnv2}oM-0m|(gxc_l<_}EVahCs7a zi?iJ$3O{5hcqMz8e5!E2D}DvBea3V9Jis0OT)velw6#g*w-bR$2*!ShU z+i*$nZ|;E~hnSo?Iy))|#kdr9P$Z2HsmDvgQ3xeDkqHe3x7oiB$GGpRjPi4>j0)f9 zJocjZv%(KPP`KMznmXz*dNOR|Y=l=!mPv{BBl!DSN-E?JVT_^V*n)RI<%w>kPprIA zJU0EM;sL^rp6j*^su(tu9+n=F)|@&VNqwBACO6@AnR6mg!NdM8)&&VV^9ad*bk%8o zF2~E<6M*TKd5?1bV-DEXee(x|xnVWJyfvdzKiBnN?}x&b+LLcXUO7pKaPev@t-hmDQE$ll)ZTsf}fbSb(x?D75pZe6&h zSA5NLC!!sDa(K*Rk9#J;8z>vEpMdu%w7^>2u&{<@H5&sj_I4x*(O;Rr<6hOj-dA`M zDp54E`ux^2-4MsnJ5{A85P825DtrpB8i?C|E(_g<*__r)f9=c`$c`!_sg*w6_l&Mz z%R>wUtzt6x-tfSRh=K z?hG3}!NJ$RsBEqcReoF_jsdEFYq-`Z$JIRT)Q3t6ciqc?TKzs#Rh?0(w7a;;qa{<) ztjnO^LjG>D_o(TGl~1{fsZdjy*CfT$d>|>ldoC;sfbUL(2=3as6}X=t9G|l1-vD+I ztlb!g{I06dkE)%m&V3xxd@68{d)OX#H$nuW)mru&Xd!}yn|0`_ehLH5^7cyj?S2rG z1SUR1Y@aCDi2SksNWb;wn71P2tT znrdZH67DJ4_t4MhxPDem^-!y&KJlr-{U%BJ-M+pPa+hs-*5o^FC@z}#5!Wb`Cu? zy{<2Fb=^Mv_@uvnmQyvN&j~wzN#T$4GKVKKRYyX;3_P$Nf(YF|Yh{&O8{hTqG1I`6 z-)Dix>>~0lww^0)KYzVtS`;BGr29CuDvA~e8Mcq(nDUs;n1%O8kFU?H&FlufIxx zAx9A+fNbkJ1usnmVIg0=zxB}V{95Z`ZEb{K*^sMLx!%G=zV~?xC50;*(9UP%JFhnA zuRo=i887t=y%$_VE5T2sVH+KcBf)6I+|!Vmx`nWHKBGIPuvK4h}+A5U06j z7AkMxMYiEq487a24m4DTM+yj&pe7}+4+$V#hRkb^VNfa9B;amxjF$-B-?3N{;gYcQ z4%mcUmUUqSYtM*%=;)XlerT7dCiYV&)nV0WTZKzm)*Ev)C>%9C2NYNv9)szByXcN? z|CjZv24%LMI@-?Vo@=Ywgz#Tp1)!&2cT2OgYyM)bHsVm)zTv-+PfdHdWEjIELe5hixucGq}jiYOvD_G;Diql># z*FB~}I49i*k*73ve8YP1Ahgw3;eNDkM$8c?RsyH{uXgR;y*qExQ|BN!749LFdyhKY z)K&Cuh_$D&+NifSmX2!oAWJHu(;~bwo|9ie)UOakx&IqyyrdcI-&R)LN$V=cI-WWJ zca(o-7XDlS+#ULmGygCClxLWD;cW$5YT*X}3Ew&l|322v<~(%rOy#R6++{upYegvB z7wG-DnLm6L^M+I_Jaw!T?ysKc`yJ<~cgVwWhSXBHhdevK3vU$D)wzM^s^SY(IAf`? zAf<5n_)WrG?)Hn^H_>BZvj+dc8j7JT%vGifXY^nzKh36^rS)gE9e4PB z^UA8|Xmw8`M$}S)dF@_3NFfHMHJqv7EYJRs0luhZF>~TSXBCO%SUBeEcNAxu$QBuAh_@vP}<5K82V}6IHtxK{36^JMe&agnVI5j~ahhPRl@% zQ>8{&+sShi)Jxkx*6c(?pNRg-;p4=3)BFNWg+C7H9lU+Wb>pXe-IlW+duA`+R8j(7 zQVz571NKT(nQag+tR(Q_)W$q;D{tJ|xfvEJV`?He3nlfT>%01#3qef&2a9>E?BsoL z2V~y4Yj*vup8@ZMi5L}|kDTHyoIH14_J*uqJwKA<{DB;H;Kc7}Bnd>t~2B8AHkO5kGDSYMpxZ`;8@y}*Nt3$9k-20Ue z`Uz<}i5@RAe>-j`MXW6L_9`Q1l7GO&6%L1c9iSJ%F^s);MvO}Lh0flz*c7!<)xnLv zQTG-O&$z*!x#3gEk6Mr8(~d1X4M9-P)I$mnx~%=Q;E!)iUD@Z}m1oYmHjZ=hKCs@B zNh*YAn(ANQs?(L+s*vb`3l%PVQQTG&=`8cWkyLNbYDA#?qzo_i_i7`Q9m{lrBL-@= zpLXf?XMjgx;PYDmayy17ye{ISLKF?o9*se z8S%AOOQs>D9^(^rgv|N$GcOaBCjqCXhEvb2meEV_U7z;DB1S+y;g{JRto7({a532j zj@d>5I^ac)dHX^i0mkihSl$cKzgU}s<4Q|}(OoxPBD{VWYBPH|wHOYj76X`llCmUf zX#oj_8-qtBErhw~ShOlM(dIVT_*0NO)eQGIfJs{A))+wtV~zN-u>D|RpuwU0+$+2j z5)odz*5Qb=buY88pRF6xb1fXky}n+FVlxyFuy$Pd+pL~Bc#6V3l>;nGbDuCenN|npOWlF?zSZ~PtJr#P$oIjW53K1 zWN{b<_(6S1O=ybLr`*$vlV*NSsSK70^4~sR|CI$s00iVK$-}bnO#vn*U0d70I z}rvf}es0NQH9>0ES0YIh-8moaLx;6{~^U`nWN>*MyEa(LjMfw@`RI4ss z$*-E5pF_Kqt6|rxD?Gq-+p~zZw;i5CB9)uotD*c?Lo&{`>>q|O95Ln}hM>mV|1f07 zS+)Czp-GNx@*jqVb^gSs?ZE@S!9OD+ww~cDE9Us7F6z zYLjaL?F;(-FPDPH*x-*>`T%=0?&wMeg`20q?{d!kZ74_h;-mjQlrmB>^+q}`AD|fd z03Zq42j^@FIMA&7kl_C3P;Bb7Dn#HD#Jb@t32NV&s!pyW%(1%!Y-p+fGW22W6}faE z(|-9o}#&P^@){7rwp>6z^S&iG0M$n}4zYa;;8{c5cpEvxQDiQ;wMCjg-gZSH! zS>USKKMXw%ls*23A@9I-?|&F73fw6AhoOCe%lrOeXf|+b_8*2q0!1PJW#~^`h7`7I z&5im7^&rF44qVaxKhJuRNd=AD|85-e8_s2R$sY+q(a7J{dcKPa8d(0d#ZZ8k=4+MA`Oe_Avh65Tuy?ymqP z=H+&3zrE_9`NuC4NDmDeJ6#%kRk-hPPtUkzje>#(Qf?`KoDi25I;B5cQUnYM)^G?#*RB|zCKfE$=N9W**xMc-oh~i(ED&!L6I&-PY zIU^GT-aWZAHiz;+ykpxgM~3@(8n961Uv%F(KnwGC$gIsJymuwx<-KL~)b%z6jef18 zI=@y%KMi@}tCY>$`P}-%zY`h`-JS_^qP=CC@p2nmT+U{Rg5oTqVDGab(Q}E^Vw=mc zRo_V~2yx=SbtM6{6ERz{y_#O`wO+)2%1N-F*`45huD{S@_x?9H@S0m^ zLZll3ygLmM3JDanhJT?7)<`^YD$SYQ&5wvg9QReeuO(63e(GS00o5p|MfWQKrU0N& zi*O^AMuisl?nFmf_1WR-BG>Spg}3W_0a_RH5#f50L>3aCaf zO?@32E9&7xKuEdaG!ETv`|{fpU%Zk8GX7y77n`{q;l!U^2Y?x14d4BI&ntJq+8Pgh z`-(0yzkoi}-cYn2VA?SU7D%`AGit;w@`!O!3XKJ??)+LLFe^#$Dn5(Z!v`GbOvpI= zhc-Yu2oxK)9#h|ceugwtA!|uD_hSV#Z`Hee@~N**WA1C}+z<(CoS1JaU+Q;Gt2QI9 znY_Ng)xg4+v)jXdX5Em&+HBkOvt4mXiGDy9)C*E`q6heO?pT$2{(CCl)h=7(4$?L! zP+bmOx`3L-$;JlW2{3Y+4-{IZ^1{E^#$9gNA}!4 z?H`;I5OfFD!=H$2^6rjsbK>hq#z#aoj$K9e9BU*eZmVHVRur%%SIVWq>dqMZhvPC! zaR5m}xewIKphq^+5md1P8U&*+Wu0UDt1y~!pzr#6IEH2UEZB~n^zeK`A9>ez6q5K_ z^u)?kj^)wAY|^@A87&~FEio{MnOLtSn#7kPSxjSm=n@5Mms_G`|D}`YV5PltDp*gP zvShco(Lq^za>M$<1Vfu`lcJ~~ej~T!hFt|u^6gGv7I*$^gR58mm{-@AP?s}4WtUqF zQ$B^xT}G)X7un_TU}NZk3!pwgnOSj+CzF!I6|rCes|OizbwW&Cy)5g7Mv!wfifM(Z zw~B0=#j7Q&?O6QfrSqb0;s+4ItS?ezFvn9zInc0sx#LYFx`Gsh2v}3m>w)) zwRdA>`M}J^pUuTt<05l%VN_?dps_h|!im37?~<;jM%H9xPhw?vkE4Z+S4J84iNcQb zNv+(>e!L)c5Oe9F@jG{Kz7ptgpu`Ol=+?Z0?B;_+^{@FqJP!AdR7?ImNe`VRzz5s3 zoD2|Kc{C_GY9T~^!(6X3BW_?a(KXaMgp6<*azeq!$Z{{$om|`_)Tzg~rw3*m? zaQbYIM5E%{NC&M2vkFwnP2ol59yc zo{Zm-?%qH-FcyE(4`o}3)Ko62pKhC#35{byXldXZN_USYD>BHMv53zWmp4-RL9+4a z^XR(Sb&^^ID|lYWM0GXRSHKh))vtgMHa7i5@U2cJ*1+!H747RgJ!bL70VT+&Ri7dHVWK3TAhY&zoY@aX#EU39LayOPFidk zN^zP&t+uCLj+)GY^|0%~r4-)8e)#T7!dKDYuJory3buu?4e%@oAP$1$HxUYm92{FY zSY%LAurY6KdWuy(6RX0vL(P!1D-#)GEA7Ytod)-Aqad^K+~m_NwWOl?6r(Yd%gMPw zSDVZuMXUg;w#JohPLM+xZO*LG&K!^u85rBjX!b|uknKmBe4dT77NPs0*lj!#_+EM3 z+s{j1hL6tU`7oU&){~SS06DCXEeQrID5OdIXT>efCS12R!T7G?A#0vhOzHtmO1nFdzlCJGc?H+GB zh790a*=tFn#z^(j2tLr!EUmxT0f0Re|KIXeO?(-9R62-0=uhI@Nq&clVC=Kt_1IFF z^0$#am-s($&8C%+x$DHn4s%u%YuuK%+%?-0#kap5l}v~EtXd(RJx2kZ{0?0bI@{?{fn?(lR^z-~8O1Ai7g1c%8~> zJz4b3h~%emG?FBo`~TW|)2Jq|?(ZM9>Rq+9D1s-xX^zj)Sqe!OVc zYLn|aXPZJMi+N)rN1G0+P&P_hObj7RJEdKV&P4^JB6!0v&|KdH$97eYW{=aXetZ<%7e|GrbheAi6AY^%y zt>!ceH5=p;ndO1DKFrB4BquTdi8edMG~1|O-Ae#fOwk_|20ej5j;MF7tJeQVKW3s@ zsqZ@?S``WbR<`S_3BM0sXKZ}$NLyvAuiaQaGkK0rvR<4EQ)l2e|FLR9wEdR;~Xv2@w zx)e#2852+cNuZ;j!&`Haf5;T$G!sSG1*0w>W77G3OCm;~xf3JQZ6CLM8{hy1B^&F> z#=`4eZFtYJXKzT`i$QFtE4DFUme>43RpTn3ac!G&qJAqrx{u(QPhc*2ep{ijujWeN0T!9XsUtsKhH#!KZ`sfzp)hhR_OtDOU$kO zsQC$gA&wPajGbB@%4HT8__s_pQ*a~KeEQ+J#q{%tc}pr9ROl}M^m|$LPnTy~)pN!^ zjBb2#vn9k+#Gn5KbaxB~-cjaUcPpyZqGW8bF@R*VAJs^la#f#Fx?Dha2RCqmgdK6X zAxKBJXY`9ZisdnzX7aDWL%_{6uZ*pAyB?cj+*fvkE7PqulnX4Cj(?GimQ}0h?OF$7 z8+4I{YzSZ)HpDSyh0fL*)6UwYWRBYRWpD5N;KP5b<-=NglBly}w)vOms%^n((8WyC z^?nZ2u6~HJtq7W zrAiKbPo5X#+~qY5oXT(Xj(x!vss@Toy8Dj1i_R=gyJ8mtD`roC?83{E%vbPnPz>;I zpm+0ag%`j^A&u?qYfN&8_k>ZJ!y~4z0^fllPRw5{lba%SfGtTEPUcg3sl`ch^tT-{ zoS+xZi2DRehry@SY_$Px;RB$LXUTlTk_NR+<`3I<#=O!(Z2lrAlT7K0BNh2x*#cH5}C~2QAxrEc zefzpbR_?ntFCu;1y_r6t0`rGctlCB!B(YwDm;$Mf!>_%`yUgA;*xO(<$v2#}^_PA!xmKC7I@4aj5b>!6nGoFJ+X~6RJsneyXK^<^ySd~lc&gPcAtB`1$ z=p5S1k}ZfqdRe>IqqZ1#s@*3@3>Hi;!cOBtyrax>)xfdG#bvW9Aktu5J+B_@^0-9A z(Y@+wo$C0^A{E#9ua5N~CjBj|J>W~(OmV5hS5HEvic_#>$wcqazB-5y?$Ob*LV>ld zePj@uXwwwcYH?*QW!F!EoC%Wrg_N9bKYxyQ}Bt^RSCHwoxg_o>9F` zB*j+#VX4Uegkq-`Z}{6aR1Xg}3<#5xto9B3Povn;9T=}?HsTReshuzRCYtzU@i zB)u5aO08>Bz&d%Bi1=C{^Aq0%j(hDnj+2mCy`WM(-{BNg=kms(dXVHPf&>-MuD%aj zU+hL^l~%xP`PrU;4HmVFH6c~gmyd84RYrS!F4J#b>0D^5;c`1XjZ4PpGE`MapF3}P zDr!^O@|^}cz8GhXLQZuptqy(@1ag(&$5}j+`pD#K4bFYF&+jXD^j@BGZ>_|Qm597- zo~wLM@;m@9Z|!AO(4pR)n@=dh7^FlOqPiQ`{hZur{-Sr7wS zM6R8AIq>OIjYoGCNb+r`$}Wy1$6l@r>GgFk*xqRjQgoD_$&Rrv?JIN2{9>H~w>)Tz zS9?@t6ib$_1W$y0oDBTlE8oQLkb0htzON-W-O%n6OVrD6uQI4n5yG+T1>RN;WH;fi zu)-S~7v`R+1_lKq(X0%o(hy~;LMF<&8Z5On_-0$w%n@WT9H7}!!a3_Zf z+fML_zzroBj^Ug7YS7OMyUp%ps=I~IbJlZao5S&HQaGhDEX)>8A!Ow~&A!;HHAN_3 zidKI_=CUxoxNPt~aT?}u^_WBTFzMRbG{Y`?mga8*a9urvKAEdYG6uL^ETI zj$ezVsd7+bF`ZrX1{Dw;oABS^>Ce&fsoVYnQtlxKBuBZ*bMTN?qSZ?=>02rNGXUI` zE=5huEEePHksZz?;d&iQ_?JD%(3B7LJ9Ar)=CLfd*##bIw@n#wKlq#(I8iM-DUa;X zElXS`w-@=gFIIYAW0r_=mfP^ry~Rjq@U6&WI`q*7=3q_v07?#lJ)=rj>LwYmgxIkc z)tOpBJ$y2}1-(-I=#x^pyIE7gQRNpH-!J!czEq`bFOn6?qKx@}u`U@4r^p2>o(puy zm??YQ8{!_trIcRsoIN4Y#tRIKFNPG-+vtOPU&Kc+d#Y7q_CE`u5rSOm@Dx z;a4xG)v)M9DAJy7o=ikvA$~aeY7>Gc?sJa|4`~c|*nh?cR<(_MFJgg6X}z!?#qX;) zdU5sG*BW7X_qkIfKgH%zL%SL_@`)ho5bltg-81$Hf_xIrhEGD+U-{4jM@T{`( zp%`GVKb0u)eLMws1s9w=1yCmT7Q58b;Ad~Z#V$q`U%SNh$>3vnUZM!=%=aFyeyt*$ zw;wJZ);=y@T`c}CHo`&ZfBd&bR7|@b-z%lHHwUQubhe|fiEQ9wt+sFbQes~XxyIu# z-^U~bwv8-C3sj0o`9|G~{SV(Q{cn%wN5%AQ<-3K{2|@ephUqeaRVMH)BvpTgBzO)a ztym|0!|(XI?yXv=u`ZTVsp7x4a7?Fd^&08^ zoU)|=$A}%y1QI-wsRkSsEoUU0%yPdW8&-G~?#ovTT`E>+xg{U=b|y=^1twaDhW}>Q8*zygyqhCee zU~0m2+KQ%zE_60b{XAN#qSF*aWRoONjr)_mJyx1i@JKTy8^3!7iqN1kkUWgwaL~`J z`LID_AoG^3z-l>*WpLAPos${K0NtSD<`aQ+t56)mk1?hn7uou=+wW<5NbOH}V%}b( zeA%r^F;cqu$&}b+7cud!%0O+;6lu zWmkK1CeL#Dfr<;wI9O{!rin2;$O|@0T{o;)M{~AfiGgfcrkb*2H!zKxnL}zX1yGiP zP+*y>d<>~&aA6c>nRpuoCI}w$w4zn(j9aDsnHpJBopAGK_~FSo7egES=|Lt4v>>M)c=XIX^R;lc&X?r15(-jt9AXTxR$6&!=1GtEdIu ziZMfVDe_(mWXavcrqK*ZBNi1%%r!L5#L37wp+velrO6@lEu*b)fH^qQ>Hor9_3-|< zNXG5NrvGNjEWDLCKheO4xfFl8_GA1G!GHDRta=Rja0m)tnjy1r6pB{}g-@_3t{=O% za@2k@V69|t`C07+yopYeGgN_Rr)xoVaQcPW#j6O=u-G~$I&!+{F6MN+p-OYCTQTeUi2% zNTJ*B1CUft_PDC58G_z&C=%6&oW{pgq8)>v z7Xx3k!Sp6APk1b48T%N(W;NviP4W-tkhA5(&4FZrn)4x_}LMhY12 z@V$C-yIvWBvoiG30)C$<8`=%EU%i`tXy%@pMl}wrqne>GI4cY4sxD?iRX>#7s}^}& zR4v*@@dz@a?9khi!JtV0gO!U1Pr&LYF=Ia+-mD)MLHy8<)!-J*51D$MUW}Wgb2;nN zQY%r?DLF|GYT}`GTg6s#z--SMKgIUR*Nxg+_$F-qMA2Gj6Ld!m5x-9P2Qf#G4fZ-Y zntnb$R2y`cc!h3)h%ab8&W3tuZ#)=wC+2eF=A=4hzes64aSAbti-27ZdMok0>yar0}ey#J*I+vE`8}Mw#o73w3t=xRj;(szZ z?t^BthW~7`b+0dm8>5Y2h++hAW3{B9xS>;z#d2&Lrf`>&vg5u#$tg`8pA8~2vL(Z$ zky@?Koh&GC<02(%9Ys8fF6TG{=$zqt#hg!)xA}$l@?9jVZ+Jq?ctXs65|<#bQ|gGh ziSrg?V6HrBcO-WGm~iF@)iFm#qx-j6VChb-TbCE~n7VJ=1<^?5FTw1sS?XE5uQNOXiz#4cs~qGsX{=*ZhZsm z_&HFLsF%s0K+UBGsouj7AreFMy##$8nuek>pAV{M%P1ZDemRkXy?w0NT> z3XMSvz+_tBH4}>BT?BEP3J+QC#q|}&NJSR@VJTv{4ge}}?*5Ce(>!>zRCk~?ThrjE zV{h7BC=7fO9sKq6!F8H9n2N+!3j{$8&&L1s-s4fcZo3)~fQxyvJpLm3(d4P7P>cd* zz)YfR8u+I^XY+G9aoMyVb#4Zl(svPSJzW%1mhZZ6#XXMfTTF@mPm*p#6na(HG4E>C04m{(V1o$#|&L!C&aXj_UI)=#2Kn{ z#_<$bF-+rs9BT3)-#6vHZbnKw1FRa$3bWGyBI^;-UqRa1KT1dEocfHrh~T?#oDhNx zYdB(t6z-OFOT9&T-8Mp?w*E_XGZo~n-)48}Ou^4#tsxHcH>e0PL@iOQN~%qOrPig@ zjAQIHYsz!U@oFSAiR$(eG4keb%>v@2?|xRLO5amdcfss>FY7XLB}Xhd#Z-VgLVRJ> z1Skta-AzeAEqv#@DSh^xr^j=Ip_v|b`<`V_WpB%Pg20OD4{o*Sw6=V2cn)K(fpLom zmiL0sON+2@g!r6VdJtqEs8s;0G2w5ggNq;56N0BaLNE!LJL`mCF%W{+H)4;*!1PYO z@w4SNlm;}h>2ZrEUg};`v2$!ga_q1jx=508|-F1U-#!M$i}q)5Yk|1^~;p>stPl>G6AVhXi!#;2-aW`v8qN9Z!haRS*WaQ||7XWifz(WGKs&g_vn^6<AxktgO!yH z1&cIYh6qTt)jD21153XxSipZJ+9LcYZ#mnju66A@K8Ky}V=z*fw`L3R)6$9C$Rp!; z%U#Ga%YFle`NypVBezj#%8V)uB#7uI8^7l}Va1JBQ9oqZZWe-O+=G7Sk^rycm1&o0 zNjr8s0JU!Yl0IYsfZfV4$71IIN$V`;RKwO_>b@ZL+2nYAIa= za(TWrzxV5Fou$`n5pv>a3U9uIDpr*Z$is|gjpNn{JHr*iPHUQA=qCwpmcFMij_&O> zd71h`x;bG+zBLim{HLqUzL~dZN72aX+Z!OHaPKBdM_d8a9R(1oZ5nWHI!`4_(+B)D zq||*zG5-SW%Exm-KcV0iYj2yZ3vlhBiU{N;e{UhKhBs5|vV}$#<7j(7*F8d?zJALP zBb@hw_?K@8+HoJs6Z0f;Ghdnkcc-elf?@kRA$_Uv&t<4Gjuel;V6keUPHAQz3K;n+S~_rU76WFfCg;wJ01Z`LbaEsxf##hgsK>I}WvMU!j{fU% zAT+gne-;?<|4EmH7rgD+{JhWWU++*FfNGxw>etSwEi?4zc0 z#@1%poD*-t1MSuYfQ$)wEIj@uz_zi&U7=!pf8W>VoHaQK?;HudPfHI)-}ODKFE7AJ z8R>PM-jB}Ozl~2bT4SWsO@nXP)DQMj>QF4M89#5$9R{pt zjWl`ceX{QUUJo47{<5jwW|?Mx1^i!q&Pn4YZ;j=l7<)W@AXr?c1vI0Cd*q< ze}D7-AE2z5uCZaHjvS0f!w~Mo3mxrUyRTNb)4)V(f!coL0&PQE7MS%@9z`( z-L*B@cSpK4DQYT;9Vhrc&bJuC@XVAxoB`Lvs!Pcu#_{mC5#ZRd>;8dO#lvv?>xtvF zFdQHB57epOOlaKy9p8jOs>d`AP26q+nh9wM%|z2+TI#Xhri9o&PbO5O$LY++tr z$;P@oj#OCgJ%0bsIWta4<9ZJ{HzB6pZOVZA1LM@c*K}t zy7YGSyLYkuCZyDRI+i@x#_KJP=$?N1MX8E&J9iY=HJ<;}d{4Qz@@IEo{o!KpWaRHA z|Ns7{|Fy&lhWLM{PmI~2qq)J%Qv6 zSs{59?-0=O3Jn*CST@RZ_VgZ9yIQ+JyLB`9EOxDrkEd~e?6cgJdMs@DTwLMd%(vER zQVQnyjFzuySpeVoBgksUg~%}SBw%4!8v&dZdJlm@2wZK(y^1Zu_S4>$;<`rU_#dSX z`nWg8k|i_s0c!Cwzp-RTk+jKq5LvlNj40SB2uAi)arcfx~4D0 zwy@kMV=0c#_$3?oR8cpBj^^Q0c8P9 z5BrXU(*vFwh(#+1AQ9pSh-^zD%2&U-UVO!+5Bs?u9_-{uvg4S&p24`4esg>*pxL}bwUTddfg_mFIG^d#<$DadnGN!Oe{fbDD*RlT5mT3tIO?Nw%@p8XOKl4d6>zZQ_+FL ze=7!0PDjq^J|uLs=e2uXau*9~ShSiQ1<97m2)gR(eBCZ0in8~B?^;(V5ppd~*U>ta zUk@gHhsK?e7uR(}QgW$K#oL(xrBi(t3aVJSi3q@ISo_6-$x@?hNTa^mdFFz45lBu6 zzlKC+x)a2O2&iB7imPvU@F*T+H1Dv}`~#=b)d5jBKQSwbSa`Vm2w2@12zIpvVm_Zs)iuv?Ok}Jd8Qi z^3+FiYAL0w0}Kkg_Du-@j(de7Czn;!Alo*etKEX1`&VZR{N=l{(k3e8dTT!&62RrU zTcKzFfCC3w&Y#&v6zM1<`sAY1WkWY*tqJ|=tW^gTiEj+~I(?a7vn!JE2rY#XKa;Qx z)I1o_OW5bKUK<{YxqNstDlHD=fiSSH>fUD`9}Hp!(%z1yB1_Kus?TldpxR{J8$?pb z>trc{CK>G5kkWoo{bO;&0GxJuBM*@g9**)l*if;$W4X`-dSFukVh<@SY-tNKW>L4NP zKeuA-WJkijml2n|bVgnF;Bu8fb7}1MSh#>bC{OIgrCYki6ND{AQ;+?gb0V1pWUJ-( z>ar0Feb05QR!Mi~p=qI>Mj<^)tgG_vZ25axN+CHv<(Oy7-8_8v#kPB(OlD5T^AGr|!*O>fWy)_t=Y9XYMiPIV_um2M*}o#w?26NjEh2pK6!tAS zOFZ>4r0;)MWywbr78J4yCrJVZ>Vn_}INzQl_?iW)2RA zlXZ@TYR^15&nczs+Stl?9`E#dpLKJMLi#pbv4HCS({l2pbOMYcGXD9rnTpa+N5q3Bo0*TEPvrLvw!;KiO4Z^ zd|Fg$JW8GPc|#kVrdbNuCtI#9iPjnPnu%~SNCw;GLjG*`W)5TnBdy9l>JV z?s)dzUt*oasu|4+aZX38&9+smP{Y<6eeyBAhK3QCfomv!M{20zZZXGS(+c!hAw-%Y z#}>4YqL(K1H*e_wP@^z)Js0UioRcdYx+xxI!yYXG&b@`4D8lCha7Cea*d#zVxat&goF~o95>+P8qcikdWp+fVvLq_SQcCod7pgR+2C`{u{0HTr^Zbc%^n;KOg{J(Aw9W$*(W~2 zZ#4bx^w-G!n&)4#0H%eFMW!Um(Yj124R#aUUtRD zr|L)K3lpJj-dZUs=;`!5C}siSUSClqe%tTLMvVI&zcr$Cm>>kywhQ+u&HN%flfzmt#!&!%F`*!q27-E{du&fwYS6B zcFE;4GN{I0_u#zw$lis#-UWJO*cMUiS-08oX6*)AgLZ)}%SWakMi1Q^&`Q(_j>;6V zP%rU`03AfQ)7e?Lo`QO&r#_b69 zsXomDiGe=~R41?W%%#xhQ@GLSzh3p#((1Jd9i{En^bgMIZLf2EdGCyYS!EQJq?s;R z{VsDW_B{QI$@+{iTi8DCi)x&nr3Rcsd+pJfdqAFan=(n(Oc^{1c)Ypj5*FPO^pOB} zMd?vtTSX0KFU{iEJ&y^tDDFDlQhKoxeU&LamozC=`Mq3do3(C95y*K^d*N z#HMD)t8+Ny3_xgHTUb7&jJBTZ=o(jdwB2R>kX$H>#ElsPqeGYFwpPX^Du zwHja{OK5V?B(*YFN<3*xTL%E4(+XZesvoNHyF8+8YJersK#6%H?hP3hdC=!R||RgT6|=FVpg&oSaq!n10J zo1oMQNf7Kgy5N|4OV7K0+jl)p{i+#VQPU-Zk5izaplp6d!Z9zc-~#J)jIV98K@Vr* zCYMF|^S7ZyLgQ?$j(-Pl=QQb7I9a%pDo;I6^@bzJG4>clh_V5hYQ3n)WfHZq?XSlX zxv|4=aKE;?f=~Q+4%Z0U8PkGA3xY+UF2#CisPOx57Z)nvm{()5f;>oJkUImVL$#-h z&V#m_)GV@K7w&|6W7d z3s*;3^NgiIj3B4Vd#Q>Wh`D&R;&JRS<^1C1g}*;bhmwN!1g9Be97E`XWr%8CHE#7` zHA>C_r)ePI;HhgVn8{Z|b;*%Q*6z~Sd`9$Y7YM~&9hDAWrrxzxJ7^)OMUU{`zZkDK z^Y>@t-m9vzp@X6hsYAP%a0N7G8ouf!{76XZ#~ETLf3l8j zH*Af?^#0chhbHXqD8|Jkp;*o+ctlC2Sh99SfXyyTw6fNgH4!EKB!>)Zw6b)#;#0Un zS$1bbGt|^<8Hbg^e~djW=d`5U>MM?(#U|*rCqi~r@W-SvIp}qJa;&S(+7g3I#_UA6 zKp^h7NkDNt;npql)CY=;BGuK!I$kf3C1$>BHR|J5$WYDrSQ+H$}E`xK5LK#ecA0XZbBZp zCb8zh$*=>*3YjzFZVRIPXAm34bycHyj&lGY|zf&aqxr56}MtvRLec$AAB z8WHzP4U|CRBW05eGsITItv|ELaX67C13NLipgmY9Q8lnZ#%F(rUX_>=TUkHRh)khx zbX;#_au)rgJb~v!-n#7f^DI|HQBy9}uz&r{J_^m@T;L9HjZK~JvT!j44ZV^f*s4!% zCUQ(fU6r=caW%CWjQPj{okto0|jFpD(7eZlB)n~cc;Fhn-GH|qAVE(n=rAQ zg!1?Hk2W>#lNeDB`szn0(jp;v( zU#cVTQ3?&(Oj`|S6@nw%?-gdqXncOkp8S-AI%Y)n2rqcGn##qrNi-{xLD3i27~A$>EOn90RIa<52 zwaerm-Xj#}h6SWATgC=_v`EG#aDRu+743xyC?q0^?$K(i(V$K|Zo7Xhw4}?cHBnDk z{+owCY7!E>_CzMR=ML#|KRd+@TQmqR;M(&(vjNZ-J0Y#q#_4j zPc!Vlp|p~0^81fB3GG{*el|F)HA{jfq$<%9xA?v8qjAUb=uSS2rt?mc~wSi zp=+J>P&9)KKQ>Tl_?(?$N=ubt2`*I5(dH_-c+8HODW&;#3F}s}TK3qYmEsY&WI#cE;6VkY~7aq%b3Ra%? zo=w zjl;p~vV#RnmI3PC`clN;Mb0FV%XPQeRm>gdCUIw{q0#k+bB6eF_Bw6fo@~AA{r{L? z!_y%Kya0X$-pACjoAd|P8I)G*V7KgcfPrD18tOE|09EX$Nam>QP|eK;_jU%>Oftnecm>5 z&{Hglsqy*Id+bezBLxX<_3lJp-zQ=|FOybB;B2DX_vR>&r@KC)3i4v+E${*hwmnqM z!V9!YC6#`i`g;WF3MLcXzrWsGbbvc3tcQ%lk_V}_a7L4yt2EjJYV9R0kX@~3HTNo7 zN5X%9Q6jStCb5~>rq;el&wQIr3%2J2I7x_ywe*jh5%)b?1|yJW3H;g6Ds)`;@&;Dim?_in|qw6_2Gu z7hHBl;&(-ETg$Ea{n1IJBbLb+>~$|ZIxI05&nuC!1s%GncO#nCl9}hz6ym*QjQQg~ zWm^xJ1!rv|%h*!y6*lCaEKrdBhd(+yWr^29Ky9)fvc^siFs)T)X6F5Gn0f82+woBI zyd+D`Xe6$)+I-)DRbl+;$|di4ZdfWQ5Bnxxd~GN1rU z7%ju_n#Inte548?9(%fsAhX#lj1qr3W}_5WjcdDX8!Vw?yBQ6NGie6#Z4CFpV(z)P zx!Ae&+=(AbJ&#Kn!@>el^9t?R$U&b}VV#+WqtX#m9*?9WX_ay2lJKZNFZpoZ&|?jl zhZ*IgkKX$;X=3Hm$Wd}ur}SDJw9gtaCX&}D}rdP=(FnyjgMKDI3u$#n}|mi&>^I3zPW4Yt~*dw>o%E>5Y>s_X$tC_8yo|UsrK4t_K;hc6Tr*pO3J5Y8;DmdrF zz9-zY^zrTF01$0IT#i9*#a2@oJ!jp8-^8k#JIzkOGu1x*@eL1=}8(>$D~6Hvb0IsGz-!+-URuPCDC^!(#55 ze>|sYICh~76!(ADPz(QN=P8pUlKi!NnR3kD1nH;-;e?fEEE7z6*Y30LiTv*N{ceKc z?&d#-?c9_f2j9T_h^jot?P#7`%U%v|#oJWH)1HM&`!poz8&B8GjHb|9hdHawy$4%B zOntmtXtR)JW1^0+N#bOiRsd&p7t0^B-x<>~jXURUvmm2l&l*-)>*l?#6O<;2EOC|< z#qa#uonXxY9Oh$01moe8MxUYe7aYK2`p4FXz=0C6?V<0Qb^Ak53aeX_HM{)#-#(tp eUFnP5O&jxwzyHvhaS&ky3{mIw^3PoS?f(GlSHY(M literal 66753 zcmdqJWmHws_b$8-jdUX^h)RdjA&4MIN*ub8?v_3@f`m#564Kq>2#7R@bc2L+cf1RK z@9&N~{vY0N?}vN$fN?nPSZl8}*PPFM=CgeNMnMYa9>qNff^c3-ODIDSk{|eWz(524 zBGP954Sb+DiN99G0EZXG+feWw(_UKB34+K7?|xxsslUKo(0yk~4QCZQQ)f3rM-#}+ z&5hl{*2>A)(B6dI&e1GsN0r$1Fc8{CbWP2PH( zG>5oubl>k8=zBIO44yOZ%`7GX4kamW7g%U0V?LFYuiIM{GDL13qr5Ie4-QMOuBj(a z>FEXcLorGoZd;i@{ZsIvG?g)*6N7<1J(&2y?bQnlEY->JkN)lT_i#==M2~wIP}1qc z&;p1}CaC1V(}{x$|2<655Y_6~p9Gf_;)3=D{w~{*DQ6-f0*QlTaFm(o9xyrk||@>ypD7AY&{Ytw{oO-gB{z!wh)2zVG^zI*6r$gS_7Qtcvj&A{~uV$`tV*d|B~ zg*W^q?tOw!M9HQ9ZBm0sT0#Qnbz0?jI$Q)Y6I4{;V)%?%-?&FnNF?iN!hby|B}Z`V308`2QA$4%=0ztWARv&W8$%%^ zP-7bm{7LNZ29`sPGTC`p3mCn52B$RCzU;A4+upO#2yiN!Jl=INV+)Y|_>1x(0YNABL|?!A;o79gDT$DRMViLv8BG z(9lFN6$N5x3F!ZAhwwk`l+j1$c9%W=&vjhD4(RE_#P0_4t}B-)gw#S3cQ%v~8eKyF zo%ij6{z!CNd0=c_pH(`MWRR$NA?AN>0iTvyA1$e4m?Md(H^r^SSp9Ahw@pP5f|~(5 z!@?5DyPmt(Z|~qfBb!hc`^Y}yk+&5}bJ)GeXBZ5EuP|b6l$3iK*{5|RoeQzNTYf;L zsMR*m5|k#*7brQLlxB`lA1^ZLA--xim;6hsfad2c%qTReymwuYakMss@%o|oNX6Yt z-o31vU+&+)u{|b?-+f&=dwnLQ8uiFf<~K1J7!&ee24B;^XY@Fz^&DtW8d+EO5)ujY zqk}lbkf@YT@?-VJ7<`(fOAtbgszIGg$Ph*+0>2QJxh(zj60U@e(Fs9x#2IJHEwi-z zO93sW_>3w}KUWRXe~?l9M73Hj!r=Z)_jre1!${F;OPqiwcp>R_Et?f(NOUiCJ&-u` zfQF@orSOYJ`G33mI2v>n7wtU^9}nVmE)Y=r7#f=E>Oaed9mt1{jvfUTrlt6|7ilEl z?pBS`YzKNXDU1=b5ts7Q;Htr$VK8=3pr>b(Y^7-nat87Mqg8ct@GDV^QM(B0=WG+@ zm5|TbLK(hEmFk9RgOg@lekCL)PuFNE|8sMqSh{fl(%}fjlfcLH>4I2eP%yh3e(+su z1jFy2IOFVwzIc!n@V_iGlG+*#T==T}e^}W6w48u{R{OsU@c-J=no0GL1aXt9HuhAO zxwL%gw+^#~&;tBvyphRRF6jM$kL-td~P77=&qq5Du_ot~xUb!JdA3V-v z&zv*#4s4`xj#+apggq8t^B1XGQ4Mgk3JiKp|Lzoc3`Q>f6`zBc=Zw7;-iM7jBnLIw zi85<9g4LqQh@92JAHm)Nv8zs}9X_?oh0UucDo$$4lXhdO9s`H!DIfCMpMqGnn;68Z z{>`0rEZ<;IHa;ZQWawycfhnJ#+&ndZF-mFx8R|Q;c(`-<)*@>hZ>6$}Ub6}Yolv>( zy-xeh-0k;*S_KA?O8(Yw?#?o7h??d1QYAG#tGX)yN5 z=FFzI&7)zbn2h6PVU;b8Kb}uDRmpYjw-g!G7QswQ@*z6ZODAD@-56t1U?i5@-Ii3( zYtOmAPF29OEZ7C=p0QKJ`K_%Mw|S?s2GBhm$Ntf&5&b9hS zR8``d#Kkk(MEy?>acdOTBc3$tzzuy!B@@;X*LAhQJ`fo9mZbA{)iS z=gvl6jY_**4c=^SfYq=|ee}lbHaNogvE>Yy?{#n54cfzK* zgX9KFKHN1D2=cnvA8wp~GnoE3<=Padnzo0pCMnNhef&A<8n$&5dANUffRGpgl1fUd z{+iY7&D_#GE8!A~S(59iEC`}G_mT|Sy}c;>lKG#Z*!d7ERI&HHu_{(?85nV1A`V}~W>EZU zNxL7FU2+dq-=IZQXR_I8r>N*nhQc{@f!bS|wy?g3$0@tNwnNICFBd%qYv_1>SLD98 zQ``PtlGKZ?)-IT1ydyf`|8OA83B9T-w7CCacES6eLED-P>xCFCm94a6<(Gm|BR&D^3Uq;;4v`tg^#rH7FW(U`?lXBPw}lETe1vT zzUXB@*Lp0_b9`F#Uc!@AF!Pb>na*~ZTBU6^JsVx~4|i>1KhoE(mD1&Wmh*$g1R%WT zw#~Yl(jFdf@3Iw-<@Q#2mhbnRZtmA$HW8RrEdbM7@*9kUC*0Jqq3>@e&hls2Y3qp8 zQ|evWQuFQ(7OB+b3~Gf*;>r25{qXlqo~Gq%uF6$7M~zsYq8ii@q_V)|b(&XqlM^fB z@G|)M&5Pp~(bhzzD7yN!8w|}^y6xySGH0`*cE!uvj+~aDf>IQ-Bdc&s@2NCdEVBvP zfn+J2geF#^!G7IyZ>J95FH@o+~`Z||pO8PewYFZUjOd@n!6YVN!#yY#a zjKerz_jm1>ucJ|+Qw17C#br$h-J<~?58JG3^8$JlPIr5$nRgzF*t=hCeMDuPky~t@ zhmtjGhnJ}DFuTolz_$DGtX1@{Xry$5&c-ugwayY|+-?`5NBvsmIPon9L{Qon%XT&* z55t2rT+;$1i1^&OaQbt^V7w{lvOND%*o^ZmyJY)acMcYCOHnU&SKo#04KLZe+p947 zc}zNfai6LLz%`|Sjf6OaANAEk_Gfr&#y3+UqtD%bL{Zvrp3UJS)!Liq$OMSB(pE<7 zj-aW9`(HlB>oDQC)yOI5nDrrdQs4cdc|C13wQl>)a8AoQ-`D^XQt+e#GmbeuULfyS z=S+FJlIB^!AT6!KnfcIkjbKXq3q=et#v;;8-f7jI6NcW95FU$foJcyn*P z+kA^2{5BJYK})b%*_CJ_CtJFB{+EcW(8*(>4)kEeqv}VuZsKxb&T24^Vzotf;(4%I zHA7z3Y0A!rt`AmHhW0s?d!FTtb>xBubGT9@y}OU&&@8K;^1Qv949|hTc&)b;2V;Xt%zaOQKh^$m`U?2> z`Ql049Efu)f1zlzYvKF1PoDuh%J|92Hqgb8u ze*831sQm!nFL8%UL~AKSTOmqgnbNGvz#v2wdn-C@R%$k(>aBJi_seI$_D6Ni9t_jy z38nlAZI8$~?Eqei>tc~()1{#nZ=OBLY2&Ou<72Ad(WSLNRh2P(xueKT)lJ=fsQ&eX z!e7j@!5Pv0{ZBR%$t$m9{Bg!3|3Q3!8)Nrqi~$WXR<@^*Q-bK1VUJK5ppH8oeQ3Ps$gL+i_w8giCr6`bi-SWfd( zO_M9xq5G(h`)xm(cim9oDfsMRZS}GzA6kP5$S9t6XhR!U(BJJ!MmMb+82d$2s%)N3nmWz`!(K5Ixt^@t6ZsaC5#-8(O7h-+QLFJ~Q+Fa|3Tzx;%P* zvRyps``WWC+P(8%)Q0DC$8CfYMy$0rziHNonK7TZ$UL(`zV+^Y)1g5O{5w(V4X^cH z_sPri+U4ucpZmm1z;;L1X$lJN;GYrez3hk`OIwafN5-w%m{x63yYI^?AqH={RFCXZ zl^4;|thz3e8u9*kaM|_mXvOEbCA|gCmji0OXqqIyT&Ma&7Q>t8Q(?&$XDt$_Pn!_< zVO3!Ugv-7w6Q>Saiu?V^wWdD?-`Ne5I_a-2dNGO{^nX^tH>(okW5~Xm_v+K7)?1ao zbSks(W!Fn=!QJOtX{X2$A)TYA4Ho4W+FDx!rsDwMaM?vS6NvkUQ@OTEWGIdX7+^L3 zi9R2U{4?&A7(3+TPsE~bx<)`Yr?*}*n=n*^qwsgQ{Kq)!CWy@$KXYDU+;%j}8(tJ< z8Tz*RV>1<44_y8^-CpgOUiPRd-mhx8bW+q9BHD;RrR*Pn*~7i5wvq&I_>RUFn6Zc zdd=8&95dDmHcrP(v-GRLrB02f`|O%2|3%cuOKS zXDN&~=t>JQQ*9T*SH6rY(0*+dJi3i+opBP;dIEQFb&u8c?E2~$|}Auf7`86 zd(GpCN4tB+HQ_LGWV*I0_2XWA6VDOIv+OV)^3&nCv##5O3A_NIti{5cZTd)s!uMUK zB^@RY>%}TM7s|qe7yh7|jpzCr>ux>YC@ycmID2kmzmi)_{^?^dZ>*e|%5V>H?4P(<0GIfb59C+N|THP;)P8&UYT0JO2~BCkq2msiN+uc}fd zc|=QNM$`Qll=v7x!WVG)TvzLFQ`5;Z+B4{@vMEB^tB@rA>&$&LC&h(?LGsns_y70qICYaGlmOZsVDVlH!BLGoXn znbq?(%OrOh*_&@HIFHQ*vewbS*z$9}IcgzByIeb}X|bsAW|e zj|+dQpKsjvNZU1%x6%pE*NAQJ<11~|PjBKz^dGCvq3FdEl*H!UROIR&J57?jJh&I; zz+VCtxS7B&o?~d0ALFZS_le)0Fx9Tc*c!fAK>da+PWs!XNk2zq>}$r;0)Hk=acW%b zx9_W%!Km8Do{PonvlS#!p_f~B5XRhAz_+S`a%)W|)Y@IXMv;&d zkt-9~u}*}>=R|ZYC|DRGOZYoI*y%knm`mJH93_5VxQNayEB%HFJLV<)hz4&biknGr zlp64HN&#ApD{1H9UgGoBJX(?l@8VZi!=PR_Dt597)s@cOu`kJOw2cODlGQk&D;~!| zuO<*o-)=r?f0oamVaO!DaG~Of1d&=#L}A_7J@=V(ez4>uW0_9Ev!P-hp|WWjJI<^= z#^95FX}-@}F~-R+mZ{w<;bhy+=Q~HvgQN38>u>>&rq`Q2 z&pF{?y|v~hRhoY`6mp2g=@#4agHU^qb$HtA+7R{OQx=ssTkS;p#YhYK zYzv88AEo5RI(iSszo^vB`0C9f^$tzNOcm+XZz`--Lew$=~4E$`jR`dPBg_;_Q5}8%u!oWRRrmplQtPDjB@W|>!TG!`gU>{P)^7RqfpzqxU^Zl=&R-pT(3 z)8&iWgoi6DoBVU!+KrtpK8y=^uFbdOlk3yPnn8baa0gq3e5sLJ^_T72lUjb~Vlu@j z^6l-du|N<=>&@`Y^$Wbj`>#wY1eD9t;rH?%Iy-MgEAXVM?}$ZqeI>I6`O9iz(KkS9 zt@b?ZYA{Omi(e~2Ua*_I(IH>90fe+j5*{Qw1Xf#*zjv%7sx%Aj|kG%u{hOV0+B!`_ktSwpj zqx*!3!h%&vrYqjwxXgfF-23nlvpewO<=%xN%wxs_67V1j+5nI)FQk4(YrRh!4#386 zk%>{(IW^uLGc&L?j{EZIFZ<4)?Bt_GsH=rDfMkO`{guq`_Qo@}njBDS$i-MF>x2>9 z@q=8ZKNKms%C)g7;@%A^xkZfa6!(C{_}xLlXsz1;1fa0d{>#sMKeu8v8dn|!La$t{ z{Llv|-wxc{`?=O@#lzMQztu%N!c%-D?!^i+I#RP^yAS{={6VJ(tg6zC6NBmQ|E;}< zv8}yqpSNnHSKu}VrX7(!VYaYD=G)rz8GR|T;^F~Fsjxy^!)o`07gE#COYGRc<87Xd zolhwv;dxbK>(9EsN=@wX7izI}M&|Ffa&hz3HCFRB5%%vsXt>lOw(S&%u%~W@*)gq(b z$ZMW=eU3R#M^wbF3@=CIgvZqcF3iQTfcNQ!OoNWDOXZx(@^Eac4eM|$xF4?Z2TT8H zZ#3s&fu*+rlV8Jg9e+y2U5lN9{;8j-H_g=HHB=p$FQ|vAByOPg}*%BHs4Bc3io%BA+mdhS)L=(fUM z%FB5C>C|POQ*|D^nQ<1kHbATaB5k!9FdhAeFOj2~q-IxdwwPae`c@e|p>h~=v|3p8 z2FE1iC0jeAKqh@_e@_zSb=KERb2&y;cD<2 zipGVw)}1%=@aUg zJcf$@G9Y%UDNF!mDjWA_v)L6szp>$(IRBm{9U9dc-|J@7b9hL(QXg?l!WmIal`kxiXZXtuI&W$64q$Nel^HHU7c}x zOyyu*n){wUmF5^`@XE<|UJN=jw3%%nqADq-7#-Dd-Cx-q=b3V$+!R@7GJ-VJLFH^c z!Ero9c5jFNR@f9ch`k$ms=u!nZE41_GU_E$o|O7t5_`FHp>$}nab|YN9#V3g3B+Xz zEjyeFTtkp-4egIX$`d|^_v;!RQ4n_Qx#RZ^j!gS*x|{bw$9q?4n=NXYauEJ zX43Sq(i)!s)JpHlsQ+7`^RCzm>fE3(_5bH-1D=WPlN;QSWyhS6$#3t&@Hg%G*~huJ zw;tC&2kj;s6n>+`#Ki0lHoPKf{Q^CO7+^nt>9VV<858%}*3^7>|GsQx zWn~*}!Rmt*6OQoqf3*N=5*QBO@Rw*(2i*h^eZh@tUS3{F6B82&jg5`T5H@5ln2f7C zJ|C-GjJoIYQZMzcARC}blxs01(az}#y(4aY4V}O=7z2Nza7tk8SH?JMi;I6Bdc~BG z?v=iIanjMzF&5#_WE|9Kbgnmkyq%YqcaqJHb=URULTJIB|FJZa8S==QE@PBtqBk0N-Orgc|db?))v{<|I2y$6U-*|k1;dV07JB@$1--2}Lp$lqNr zoZ<#)1mfR1*Xbz6$vdwP(cvNPgJF|}ERG!(K`$hjOv;NlZEL2zi4l4u!rbz*O2nO* z4}|a;;jiJN+`8j7sHmuyO3hy;SDgj5$%7Y}N4AQ_v%O4Ecj~?jt{s9F0(gX*az7cz zodH^_tgLJd4AC__6Q}$~S62=6;NHD^Cr~sGD;{_^>Mr7_E+>*Jzu8yuy~nC1??q?^ z>7u$Scx#I+w8-G_8X>C48u~0SkCxzzRidklLA)e<^b!si`Ez*g=&U=ByAa*J_;ZBY z#@5m@f5TZ&C@;UBx>XF?JNLqZ!)Ix(Xnz`q<>zl^#Ylmxvo}4evn_h5al6ijHo>#^ z{SIlQrKQQ!^Ea5G5LVJ&b~qf*GSOB3fkk6}K1V@*V#ekLF4Kyk{K%?h;uxzv?P+d) zy&`<{5b69miB`7s?Y8UX7r)ADRrs|Sd^E$)bMA27rKf=Gz* zO~@XeAu=LyIphV<9&#`5{NpE2p1d_P^IWt`Z_MPt4z ztE)RGP*VTcfMt~}Y(a(C#M;NFd6G|Qr1JJP(;5byo{M1ed|dafDqTmDZOvs_^ij)e zCVe9DKBQ79jA6bBl&>){M-s1JcbhV;+*in{_&i89XSwBAXNr`dd2+)o@A*o`0=aZ( z+jS5I>5jL&fgcwY7P6|FYvqib2NrMkX2zUmXJ$5jijA#ev!Bm^F-&crXh%^AEa7I# z|8lu5`L+hER~z;LrL1!rCG<(U(%R3RA- zB#t%J=syKd|N6&eT(w4uLz<8Q`43u?XN2Ngn14pI^Bo1i8dvU7hrMMAb5xkr8i-kj5wzJB|?>6Y~dUh(xSxd^1`7F+Mu2G!W#t_02b_JyoAv|4l z_04LgwN>N3;D0A$EMWWPz+~dJVC69RvWnF=3uz@8>%j{d_p06%E#=2x`J8iI`Arn< z@9+1|*t%KOOwUfQIN~7fy|O9RR#p9j4ADshz0ls%#GvbWjs=X2o>4rs1QNq=xPNuM zoi9Ir>;>f-WU>b0Z_{s|dNdWx*q$1LmE%k&Va2pU1+f8NqUO*o;^!r#X5Xk{UXMn~ z5qO3qrcKu&LKC%?QI5Ig>ZuRez*y}YW;xK%Jp$KtlCa1~NofLG2K7BWJnA1RCoViw zCeGqYLA{GCX2c+(e2j6-i78Lcwpe)_hMQ%f6(1NF_}PE@E`lx4_6?)1Aphh)8-Gw7 z)4L*=&)9>VrehzE3O#!C2(yh^JTw&&lM!GOS0jbggR`x~Kp3IDN2l;fxmk1gDvUJz z(V1Ki4mRe|CZ)csDAMWn{yRc?DlBHCQu3@r;oxCz8}h3F;qt@7Lv?kJC%}-2V~HV2 z((n}{C`h?yS{0s5vc6_oK54zJ3ip!uLU7e!d-&Ijvj* zFw5WE+nVwt^>Iuu3u6wJS~p$jE}y}~lww#^l$E)m(&=qlqzNp8+H8wk3_Uyqe55jG zu|#tkpPXsN0Js~pRtD%01XmoAtO~6nN{~}^O>UpS+OEN(>s~wb#(0poCE!<;_>Hk= z6k^_DF(>~q;unt>xJpe+JI~B1jS8hm9joEd-AR}-ATb}Go87SV4e);*AkPgzqAiz^=FAuz_5wbGwo`!~o&H4GimiKFe@Ks=cWSCM_;VwD(`J!NH zY0J1%B9Q$K)CEW7MEE)OTx#?4L*Vd!_DlVd%GUw{0`B6WYp@9t$@7=+(MS3DH@TY^ z>`W%K5QpT?OOi7wcG2|wd?`3QZ}TFu{C%U|t_~E1Bn>Av`2x^^DqQz%nMO5ioh9q* zvTH-CdD`U@@ZQT`dCurLclFdm+PUBuT(BD^X+qc*nYb4%26MB7!A--1n&~D^Iy%Ej zLvlCRI;Pk?I=xus=Z563^yws!5TXEN7H3<0&WbT!(9fc$7JDD#_#S@=y=MAa@TvfT zayV;HGu^-x_LM-363T-X9P8pby1Gu4iC65x(!bJYZ9%?8irX#exFCQJ#Ale&s>y+&8=8A-1OC4(+zA(M4FVGm zQ@+AHe=TG)k^>k^E;HBS^JZa%4s!P!M`cXr(_~&5xN{WgM0X_Qon`%30Qo=kXVFy~ z3}F3q#jh#|{W!5SvEAj5iJGs0bH5~2a=ue-I{SxJXxX12LS<`1_DRndm|1_sA-nT` z)qi7hvHT)x>~(^BJGs;4tfV$bUSfw;Ul{o4+tnR{m6xnfV6`;)O~u=uVIj?ThyAO|kG# zFx-%gx|jnONR27$$)iUr({~`HJ-+@1plounnuLS|+`GjFPZ5()jk>Af_F_5!1+w3P z?~W{!u}4XbcW8lr3{O!2v!?mb%9yN7Z%cUT`@__dT<-qP_Gb@jZAm4|UST1HhlitM z(S23!sS&0bWNW)-S063=n$D>HnyCmX55n)3l$2C~8b=YGMP%Zk;3NEYMPJZpoSIZl{H&UTervX zb*`YQ&UBCCJ<=p0M9`A8LdhR*yx);L`-Y;wZPd? z)b?FDza1$*sq6SVs9PmW(_U4ha`@D+8`TZc4B?(CP?+1^mfCXcT)$;PbCYRj8F`g> z*`ifP6y?>7RP>4ZY<|A$B}SU!wCU>p>j0rU5e3xhd{VYJ}D0zy%VN|aS{>|KE-Z-rz3O!>L(A#*ixVe zfoE+ij{Uh?NQv1I(aNsKdl|hei6FDlf*x|qZ>~Dl@xp7;^7B*Z18cQqCdtJ7@R#@` zm1uaeGjd`M%EfS28Vx*v>s7$mSErNky~})(m3tI{!T7?6ga3FUBg&L}y0T)z(iz>5 z$*~{Mv}QoBn8OZ)5gEaT!f0tY(3w|r4IIvf`MfJST??duMdG2duu3F*F^v6>y(=E1 zelYI*{74wcI#Oksd`ZQHF$}C15@lgCYpxA^C@J&YOezg{VK# zFjtp$w&|w5&kGF*1Nw`(4-}J~GYaydT{*tAA(0lC(AC3FvC*!PK$gjMd4ehJ_9JBF zA<9q_oN!0f4xe454od+DxPagh4Fa?a`Jk@el!)M1zA8D29#^@nhI`6|2M+`)muYql zzaD_ve1_?{_(;;}x5?=INeHsQ%re;?eSCj@#wGiHV~~v6vOQ2PIPA2Z)c)Pp-#__& z;QMJuuz{AQX)n{Ue;^DWOlH^Q>uumHUEf{jMAW*k(o5f%X1`L6?@lKHaF|rQ5&702 z7zg4-0zh^jMBrWIH`wi=MB>gU_*Z$dgyPPu6(!=qhH}Ud(@F!F)_0<%w_zKkzX7a9 zTaaQ}c?W#~HR@n4nepQl<(~IYHU^y_`ba_k1pFUft-oNucY9ks7y>u zTAW2Oj`#>Bh#eDqi(du9f%pbd#DT~QHmn?RnI)}VGssCgXBxS@l=n zNIs_quK)W1I^tomcwt}PcfhMTC4L2pP{mtbc2?+WXHcYe_jA@e8^v}@Cb4Frt|127_uVhiVUGB<6hmheE!`2XMX-hI8@Bxq@RQI4iYOi zB+mgU(2C!s%+a}*K18kyQWWAV9*4(t9n{c{Tf?j(S`Wirf{XTi%w0I8%5SX{S4^>B zRPZq$A0HWH&8sa`Fj}vI!A0IG7~c8oAqR(@5Ol}giHAyxhi+Q)PJa#x3ThB97LW(# z*Sk^=6N5qF*%o%MnMz7ESoLAjT-e5;fKS1M60>nWE8oy(T}fG5Zumxm?6`Q~)DI*+ zZppJ8*4hMOi6fD8_bvGIR1)iamz$}Ck6Kn%1~~SEPK_H(yf~~U;sfpoQ1v)0n@@0J z)L+8(H)5G6mgCSub`9BLKWbK$iD>b3IEb0h0&VD17Y(%MXZ`a5remMB@5(^Wwt)t5 zFSX7$Sl3f0mDj!__U8Vb*kr|$y>}{pyKU%Fv@6$^ZLj_4{pt-zoXJvlO}i9&sz{oy zhCz+lvz8Mf7z|n?Ba*TTTdj<_@}!zh>e?N&u|iZESpeJdxuZ?LX$j=6xbx-Mx0sV#kJA zQ#0a{@dg?AC?4#p)3OQ|3IBr2OW3pDr^YWy5h(NGNk8-XW?OC z92P8!?;?gVM};!fw(UrPc_u6ga1xX z`c@J;TNuCC_HVj=^W4_Uihyv**dVIH$i!4l!)d#b$FuJNzL>`0vjsb5ys+PW+8XK0 zOYuGk3bKHJIz6hM60dCR^b56uRRcgWzmS>4ZCwPZy5oz4-jxeNFc3|H2lu%(NY)$T0W8(T+H_UUra{OSE+Td9~5V1zSa2h z)LSc}Ty^D+D}=k6J=l|2_>vTeFPgy(p+JAu*39TRUssP>CWk!DjM0X*a(>YA)dzgw zC_r^UW4|W{?ZpR>7y$)*38z(6{XvZv95YjrTVO?GgU5jc801 z$rq_UpDvV?$pFf&2=@D(guS@9xLG^rD`;YE|D6x6$m;$(Q$V$1El|PXDzmnyA49*K z^-fTU1!a87iZSiiGBPsqA6y%O-igpCcFaE(50ztD!2rIazVD_-C|xxm=IPQNr=*gO z-^Wp{b)R<5lULHk?X=`!22M`*i^bflFuFq5QBIu=79<$8Kac>qCHY9jqr^PkJ58kb z6mL6#qMRqg0uxS^ODPXo++OLKWY~mJREuOw$r87?>1LUz*l?MT$ngZ^M3LHcrbXTm zG=$;vXXtA??RDlgX$7I`?%vi{z3^wGLp-~zYOh1SewdV$$IJL#A9*7*98*zyb><5e z-!gzxeTWjETn7^PR-pENv-PZ${t&}Qs|D|_U`cthg2*i!WyXk6W(Hw3*ZNaty4aMK z?#uCG3vmosYq7-?EkTa~Dn3hABTDOGED2xq9N~Ui3@O3c9}DtVrKXkNnGUjI+Ai<) z4sw1F4OUZMk2g8x|a&9no4h7-OZ% zN!=5ERiIeW`D#W#mO<+4eUt&Ma_ZaWcZ`L8Oqv7tCkC*Qp5QRs+>0YTvv;^2`~kL} zELF5bwF9}QmCMZ-5RNb0t}gd)Dqo4v1M|^o!CSsOyZ!xjs(^>eL5=tv`>RM=rCT3I(uwif z#|wRljUX1OKvcx$sp)bREuR6<9Nv?42Afs(SCWaC`=($aNW|u)O)&2DWw=Gg96W`b#G@WDFfdRCJE;9+r*3@;Jq!yA zdkn+a4A0HCq(FRzk=GZ;7bF)xCWDB{$hwe2f9uFf{C?{wEXl~Y-$rBpy^;~BpJ*HG zkt7k&$qpXUT`w5@ISx$uS%8@K2h@y)8VSSqG`t&UO_Ikga*ZQ2;aI^jW1_^wm-ZLS z`XR^2Ta$uyTJClCWcS+awEJIh0K`K+TqQ6-JpnU2v9dy&A;bO~=sR$M+)%;qfXov) zH9%yJGIkJ@&Zvltu`NQyk>$m_=YeK9%zB?{k1p#fEiAJ_d_+BZya9?t;-6o%PpAq9 z^{#wF@{%^uw({39=M! zVWi0(c4*BbE6I;~l$*ohw6M~9tr~FnI4Gd_0xCYopTz@%F0NsAN6IR(B-^4Nl0z{7 zAh@8svh95aOb4lzlqmv`j1;-3N0@YT4*XPK^7GkoQ5;@EjfB$q$OwS7pqq~!K;D_3 zf8n2>A7!3c%{_F~s5pMCO|M&!_#F8v<2@X%o0q?F3OK|F!9P-W+XG2afljim!Izq{ z+V^_Snkr8cKVgNrBfTHJZ(FsWl-EQKKtCE0+N6}^Fnt<+vJyR!e4Xb&)Ug^U2Z;cI z|A$=_LH6C=6`{p@n1lrdThc48d5gv?AD(8&B|e!l3}@QDHwShbc@AvT#=WZO45Yq2 z7{Hu3>r?&HL=$&*xNW#t1z%z-0a$E@GW zkgOQRuC6XoPyrF2MvlawQ-db-CC30AxCu%!+Hcw zPgO9=nlm#q850JU_xn`fRicRLK>Vh~RQ?MSx(;7R4jp1LvEOk0>)OCE#%=S(kldF< ze1?#Jhfur`)a_b8rLoTwUwn&DybvJA#gRc|PV8j_zLB@|93YWX!RmqdX~ih%BtFIb zdJ?E%Nd9BB+#Tg@th+1V>PeJ)!ctZ&RfVM%Z^ZOIh8zKo%jU2`rFXlcQ)_s}f}^JL zOH8YGDmzPjbA8I0(W6&4%dF4l$%-lboO7tb-X^JXiV*E5N4#rez|;^?WKugKNBc`Q zwfvkKSz8+nRW|qLlD09YX+|<0-DQHjJA*IOfM1$-B@cR3fB#HiyXY#eCL1RO!K_bvyN>NY^V1t_>`Dl9%L}`5TJR4`8Jaeo=Y92S;aT=e}Oa zDQCV$a$s(vzo;8&jBA$k|E~J@sKAi_p142wx{vzZd_Hq?vs6m*99o#{4PK7h zZ|d7-8XW8biVn4eK@jBjTF(RW8|Q#BZ!!OtolzYb{Vd+3*X!n&<0c;M&;9h}dC^e* z+l7ggBF^2EU+nX{>{I?$W>?m>V1E|%t(!#$kDu99C5*4-+N#czZ4+q!3=VH}8gJr< z4KHh``(8#hm>O!VIrlZJ6_%b8vM2+@=6~xIa&E;^bon5^`c6P{vp|i6U-*Gp%!2jd zX1S|9IjC))Y2ZMhMEHq-Zef($h902;S%ad46@79NI5v+x@DQ*E1j!3t^?qP0)&B3V zqhpAKe8D$d&@}_0bZ;;Y=$JpwEl? z^tR=IQc;0FmpBG{i+?kMy&lss8~s?jOsp?bYHE9`+yE+J2$cKf#XqRMUAJ9dB`0yJ zc7quYWdC=96cFmvS)u{hG3+gcyKJ9n+@egpDE40p^`=Zd(YV`&v-;lk4;isBP>Wkn zrBaffRmm$kDCckk{oK~z34q(^?ZqsCzM1yD5<&-9GNrb5t`i2Nt+zAvmG<JyUtZ^xz+NTjD8H zUHn_M@UR6^01Wm;UPuBwY*B4(ZJi-G7x0B*dU`bDfA2~Yxx^$d4Bn`z#bNn%C;r<6 zt=WWw4bA}e9{vBDW5)kK{`6a$4J7WE%uI%yn5UqmBY@pr3OLoU^juJyXdZwu7nzxpIEr;kn@$oMhalYtq96Tzs2N2 zhLFAm#$7!KjE26wgjB$nIl@oi@B8*`h`+oBGwTzG2JcTh&YxV@PP;b3b~1&E6x6Sq zQwMz%p7b}m3r){8e{ol2`=!N!L`oUFuLOsWF5%8Sd4Z4e0zc3-r{TU|u!*KNQ`X6E zEk&ds$k&u)xzCPd(rAYitwgeSu(}*dS*`AKs#slTk9nxMxEzOwbw^dL-j{kl1>s zURinj8Bldg((7Rv1`0o_tNFb=A(bjyy4chb+PBBpr(8; zk%IhONk;cqCw6PLHI7bl_uJ-B3Hm{}np)(JbVt*e)mS7Glth2v^M!-l+ls% zD`F22-(mzO9KO1-_If-bd@r~0-|Mx~MyV;Vk=wSlcTa1xOkQ4`NfQBX zh?Gn9(|>P^alX#|kdycxPWY>gD8?+kHfptPiw`{5%h%@I7t#!b_a`QM{xdGz+0Q<- z-s>$d=9PE)ICjyYVBYdjNo$iRRvC-_<4;t?#3CNCJlL?{li!-sY z(Hyg0I2DtWs-EpWQj?jr)Eq0+Yx0O#w>Al6e~(7xrecI~4|K-n(WM-dHMVQaztO+n z>e-4*{o7Ri?FXA_T1onk#zNWz|3-VvZfX|e!>f$zhaZ1dXl;5T>^*D7an`e9JieC~ zUPj+&%v=1qZby46hL-hJVN?c8&^qSZa&2Qf1Ke#2*81{t$INAZ4#q5At3g)^B^*{X zQmY~|U_(J;^)gmXW1JrXm-$DvtlGKzOPBcxvQ!@psarK!@Sg6iwq}c^Av<47G7wHG zFu9lu)9BB^wlQ@-*N6W?Q-)5Wux`I0kV<+73CG+jd55M% ze5r*HpD|%uX;yAH&c78i2K_VYy7_dPzSzsO~`8#lNH;@WnnDqAw3(2rE+W+1nA!CZS{hJ0?~9sDcMf! zkai*FtRKu-Cn%3$-c!YvK9yto_k$e_SV^_PJs7%?^_v6G)#RWd#XtpY5+{wrV0h2S zXb4qY2veMe+ow9=Ps&;o6uCb8|IqZ+VNpKc+jNIWFQtOCbhjWOog&?WwA9i|qf!b; zcXvwHA_CGNxHL;h!xGEV@jiaOzxTS}4`6wA=FFLM?sK1eW{)oGtdt~;`9u7K{|L%f zeCkQM6McD(9gCBI|LfU{e5dKr(2#vRlcc*&j51HY>R#!Ip_FP-yv2TUnm8rMp4j-t z_X+oHQ1J{B7;vHri+SNKb%mLk*&X^Zx;vevQsLBcBUgF?I_JoInyzC6>{?@_fh1vY zLvVz{Wao5-7&rW;BxN+Xrl5CHZOvTSd$!``1wj!{Rj!bXa*$Bx&$qT+#|Ie`N;Nku1 zN!9YyZ@S<&1_oX7eI#HkA8x8l>`@G!I_`bG)j&&i+Ql8hPWn%;NYq#!*Cy-KOg6oD z#wz&$Uxx;TEX+^fFX5KW?}+kaMApPRp3N3n&9x*R+;#N8-J%V)X+ndDEQl^;ZjK~C z=1_r;Wb9s~pxjdLiLVsI+a6(2_Np1c_qd-K%VF-HEp;P2lvu`2THUiX!J6=Zv=Z?|d; zBj3T!kD0o>t&+5N0nyuWfVK!kQ0}v{FJgM%8Nqx9ffJH_f8ot_+D<_)pMCW$Cu&&8 zKCn&>0wU|v_>HBSWYCJz*Djz3G~ZAaw0?4F?=q{svo4fVsslc81o5n}2w-u0$%5!Y zgF)u*Pb3YObStJ~p{mpB>m-;8HDq(Xh-8{jdGJ&CWSvIeqdJ99@lcm;3@$D%WsHf* z^WuBj)dpXwZWk~=8Xb8|e-5yOi)i5A=b6foYScr)ltYg|8S8(MZ8EmmA44I2<2L&s z@j-XW7y|UY@$26njEU?`&+89voOl9(##6OCI^<7@dhcDAP@E^*<>Sq#fMK7~I{T(o~?Fzdv2Gi~UU z!lx)dcrGPp4@+zFB-A zX+{bB6W~Xp8Qni^!}&#tbkbGdHzvi(ksG8IQ8~a=Uz;ItysM5PeDeX$JM9zXLb_{^>QY{CJRrb=$(yVyrI%+&J$C{x!)OSLr_0A zP8PrE_V6}!1Cs-ioQ1geznp_*b=RLnM)T>F2v*ID`WM7m*(xkK?DfGdz(mX{o4RX= zpCM?)+%1co3I5wQbt~&bc>1au=>Dm5FoP4{+nmSy=QMzQidD-8Rm&@rK{+ZcJTF=_ z|J!3#5DICw$%ZPIP-Zpf?IvHT`%>WLV%GI@v_Ou-^d+e+Y?6UDR4oFoeCngEv17_Y z5j&~p0D4wpw4_G7GoJ08%1c%HbCbgV8FsA@FQGzB(f2O`aE{{mml%wqB?El1g~0Y@ zN(gQKrb`C6C^~gZf}=cc(orb0UsxnU)248E)O9sy5hp}x!|V4#OswlPv>?T+L}RtwImpq zUuQnS(=KAu=I|CT_G3ZB_FKE9YP)qk0pg9F5v(4XZ@%|Qv`)Fe!IwiYBhXdC^f{hc z_P};!Zq(8#rEOnKF}gd#KXpQ3NYRczNDwVt+$K>WkeTRLrgv7HGtl?*1t~uO{}|DT zkP8*CYl2#Dn5VAsJ{|DD4q)$nGo#BdEX%}oSRKzKK+li$Q61iW@h_Cc;n|DI5>I5% zchvSr$8Cs}CQt4h05Qb_9|dy){PGTnJL``-L)n0(%fANVh zaRVsJlH2pWykSndK#1OIO0=R;ZiyC}G0YB;HU_;rKzag7rI-x2GUz3(Ikk z?SGGno0)$s1Ih{(Ck!I=2igH?f18A>QGId?PSU;4sbZ=zr0AyNHx3p__Wl>C!#Q^F zXN&_xxRZ5Bn_knQ7te$n?=wri$;4Xv!+4E%s5*@{C4;v3f#?M5KScR6_s)cG>6N*b zrO5MnA4a-N>V(1VfGTF8r&KXrM83pm(u{6?d66~IFMY zIBdlZzSX*Ob#B#ne#!j~^DogI9x2FLS#$Tj+1uMAUDV*p-C|aAOvIhOAD1x zif1ZaW)vNujW4T{6MaMfiybrhr_SOOr&zAL_4%>zqujOncx|-enVH9XAIawKbkL%d z8dCm^_fYdG^JL}AdUs$em~YqPV}Bgn$Tdo##N{Ua(+NVzrz%zbXZ@fddn}Og{@Axu zq37CE5y{lV+!#uJWz0FyoQSy);6d_|)1!Dqk@5;aC`s_3N`B3hAA030h#}f%hHcxD zAR#}v_T)_-Q0+{v2nk|*)4l5?m@kz=} zi2FYJQx+r}8#8)GNF?jRof53%9i5yCLuf*m8S62yfy(Qgt(I&@ES?R>@IQx3zM_S| z7Cdt^N{r?i+wmA;V{r(9!7<1QGEaV>pU9D$7V5|27Bt@HOo;JRJzXa-Bm3XF9cp!p zhki4Ue(JJF#auT^6Gg_gCiao7uJpehh6sESuf1%qYP+K;Zv!sQWj2^uH!F>wYu>NkhIv z&QB>qt!Q$hHyH-WnCq1P&p!8mQCR+zzbUC$)WZodl@gbQoQcjbezfk+`rFG&5|NM0(l?l*Nx z#Y#f8dNo1kI(4LV40T_l3aUyd-3qAY5r{#T6IPaOOT9%U& z_VXhd5Vsd{-znuyY@2KgYd)#7^PAxq<(^|FR^~`Ui^VQjxwXwXM$g2^!N7}~Drs#sC&c%^sO*zPeeu4@jO=g3@Csm!l0qj?hz33;8;H*yXEcK(`n!>( zm=xZTCq_RW#lQc}cQ^&P8t-GPmiI~hst3U34uCXL46%jE(%sjIS}BetNNd!-qDW*U z8CDG3H)W3HIXseelT^#LhTN&!`QoXx{^#Gx{p5IgO)Ilb*|S#Lw0vrPeO{iJb@+wC zw{8;ZK@80&#`Ao!0Hpb!hlXhj`ALVWmV?#42$Quoq=ijeDr~0QVP|*D6?UcDLY1f2 z5JI;fEZ!QbKd(k%B^=sfghG{9{*bq`7)$*N3i3BevuH7c8_3Y-jCOJL+)i>%k?)}8 z1XLLI6plU7Yq;aczA#!~%TwV;_Z5cZGcNiD08!yFqY~guk0J^HEKg@Z%%JcyAL_NCPT=xgk6E86d>C8``A{?CFz|!wZi|s|8yc7 zLb&!vvTo_-O)x0-q%XRv2gj0uK#ZB>Xe{CodCQ0vkK&x+BjW$(DnNRArDkPTvcC1= zI5Ev;(Os<56QE6Sy)D$EV?v$K8(E0e*oe7x>LfDRT0ttdkQfXyY%5|qG=-hH{M~AX zG-(D+I$SMS(0e18l`koAV>2Kdx%1#;^WsqrfBe5S7^e`zApi7JV5H$4W=8;-L&g?1 z^wR^HCuS?ASY@09RbK+=9lNN|2!`bctnQV5VJwqPY`izncR0>9mY&1`XaPX6tF!&+ z_)=)jSIRW^4iQ)@4^&E+=?#wmA$|FT(tflF0LXub<;2QWqv|^%`dXZ|&4QlW;bobI z5FLLGdsyJJ3G0|FvL%Tywlw#_Tw47zZ7E4U@cxq4l?dt)K=>_ezohrL@WgQ?GWuApnw}`Ge+j-sn0--s+Fqs{qIw-fGmR zejyrI2A7i~dn5V|8n2i;3d^@N)gft{=SMj*H4=5)d(cG>u`N>69K>lz z8$*n0`$(cP5eSuGOCp5BqG__aN06c-e@eE92J&ljGl!R=kTv#yX-=b3$x@0j*VPC{ zZ#gJ$cMCmAeBJ#0P+qZ9K%@8S2FkYic#cs$qzyVHxb}{?#c8g(THxZBZR?YCTh;xj zRmg0$$!u|~O}_d^Vs-!+c%fB;LPOrDnw2*Nv%(W_++Pa}jc@X!rHZ-tVqF@FM|e{N z^mROhNaOzC{oe7r1#I$Hld7Z6*3st;HsiI9R=xh3V_P0$`XNL#pzr#k1KHw=yxx2{aB)zx9`7&lNb zh;=l=Q+O#t(%Ag5{qv+iN)lw`SMqcvr|O59cZ!ibv2XtuGxVy+RJf^b66}naFbdH` zCNtC$hnjK5O$t__H8K|yT*GGd0gL;`+d8bGwGN7L`OfEMhDKSFPoJs4IFQ-!4GD+T<*jP90)Z{EOy5 z%v9TCq=}Le?L?gwnc2X};q{XKC&9l5Ezt2QJuxsaXnZl!esiq9mo>IXMX-e;5DO&I zZ&FkNTS^69gAQDO#6W$Df6Xx`S=FjXrI%a{+6r80q#IDQKeNXC&nsftm>;c1EiBw9 zmrUF1S(|n$Mt${XrU;!nI6z@7?Z{*KlzRm9zsD4X>tdm^YOCWL?T-;1P^&cEy*L#X-NY)y zE0)LNWWM-2U8)?x7CODg1|A+?r%U_H?esKv3LNHblQcx;ZJjn?5{dyBr7t6W`hNPd zmQ&0c=sy|}4)*6LHvgzpXy0#L-}CKRON`Ci6mMF0>^P}XSrE>wzP`JeMjTkQip&0U zl$n0L^4pzY3dr3*8_5q*xJ0vy5z-tqp%X8Gh)%P=&hGBa8pGX;VD}~O?>&~=y`4IT zk3IvKJQPs1mNnhK?uKt?vL;|*WEmUB_Uj0)JzAX?;o$JH`oG1?Er^A-JV_3VQGLHE z8!YR#HQ_&%#>_BC%Mhoe^|9Z&QT2UkC1AgELsjXbrj@#4j|nUc=S{H#bNOaU;#4PJ z&CXD18u5mIqm$H|$%0a3;S-&529{eLJFMNdh&xn^!i=>Vx%Zm^&KGsE@W!5;ATr80 z1=Xs%rq4GOnhNx69BKcrhjv6IY3QL}Ip~s$aF7!&eAx-}le9 zjpqA1AxG9&p5lUkT>(6KeoqXo6VC6(9E=25s@kQri=4%b_1yc8)u1&xC{q| zpSJ1)?uGV`c?iZn&w95AOYrfUDma@YK>9;Vx zGwP_5A(g@T6*Xz>!A{bi73_+U@`TAYcGcJco~nyWh>sZBctQVMZs5Yu0$z3gGpksa zmY1l*7=mc@U!7MX*Mm%wMvXNwZngDgGF>0uRR0J@9*(~$U7Ri;D>KzfnynwcY*l~v zV3K?t3A;8PzL6rtRCcOPXy$ezK2Fqil;C5gih03pBdG-N;UqMr$sj{Ly~bQ!PE)0; zxbi>7xy{8Jo%Vh9zWYb!_~6$26QAE2J4E!QBfc18I+9k1Zj-4^K|PB;qwlQ@!jFxg zT}G>+t4Lqsoewf(ei;&dk1;@%+44H-eoguJ!@X9z(B2Pr#nrRv<6Qs8oD{=XuX6*Q zrZWZau=Kaf+zV>06H0`ak_gi^ID(( zSlPj(uWXW`f;Ln!g@C`qVgP|bQIq+B*!Vio?=Y#iR+P#O=?=Pi1DlyS6CkqjES^yT zzAV+MQ8amOk~>Su!cekLudJ`lp zj%GoFeXrX=hH$ssuBly$Aq%s~`Mon=0#N9*rJAK)!YMt|lHP?DK&k$2s49l)CD*n7 zWDXC>U(suXs}v{{ zx-ty~9U(F&nRnzX!|8VOA*N9^u6A25xnbU37a{zGO%Mt^3=k6Wk(9N- zD}xnyGUZE`I&?IDl;hcgx8KNaG5jK=+l}Gj1mUuBbxr3OFN@ExKU0n{Be41Deqjux ziyT7#UEM4d+b(g#h0GMG&>ZA7XPN|6x67>=00U^_lv)5zl~`=1a#Y~AWQVteYPoXX z+Og3{$_q82`(XM`eg-NG6W>SF+O9Hd)FWjDPlO83+v$eXGgsyv@ywEEs#}uUQRkgK zSLg85&zUv2Pwswg6fydiVG8eC>+4&_(vZk)HNCe+^JnopDbB8zPbkPEbAT2-c0RyG z2GPd!zrlzG8ceyEV2-th_Y@R^u1jp8e!llbTGzWV|7B{w_x?3_#jp0u|GyUCP}Q8-9ds!ai)61H z4t;3WfAUpD%GwBqYrKz}th;XzD_J}fldirMn~{<6yqY2itChfw(DD3*nIHf~V&kH$ zoZFtGozpc=3JcTpMQ=#%paH+k_eZfJhi9VNEM=IwgU){}xrok0(S-<1av56pT*#!) zmd;LOJI3x!(|OLCEsJag$C@`2{V65wRv;Z%@*vbdTG=tE{MCsPNlz^eI~>RZR2SBW zI_ZI}%s(Tiad2FQi4VVRCep*ry9>Hsz%M`0NFqY^7O4-2{C7!h-*tVuJ3r_o<$)LB zTc(M@gQ5i?=vWw+tQydzAR?uQ;GMVhH`7`}x7*@g1_o>0^!)Vjm$=rkOEZV5#cD(9 zLBuzG3C7}hI9rSz8+3voVJ~)0{ZR7`k}g+<&I5oDJVWmOJcP|v|?63Aj$9Z zW~&ry&0?gF1P@x^ABd%)G;Op4AIZvAxvj)34 zPgz$nron@#y;|_#)`{gbeAfm+J!wLs;7& z0&sys5tcAlIw4K7J7Wezao7>p7;Rfzm8r~o4U3X!3rT%PClg$=W!#zee7fbj1G^ z-)>P6$Zq<79FsfAdO3e^6D| zfyeKM_qi;uy6-Lu*SF|QmtDKEXW*wKpg-jPJ8o$mu|Lu;XoY(uazAIfLYJP4?!7(D zLgHtEHAlD-8KTeh9`eoyGV@>OoUbzR^h3_RYMZD9j+$dZ!2I`xtfHy;O^dl2QSD=0 z=SAxovTeK=FTUr_UC040g~cP40Db==dRapjY2owDeG&}a8Y-q=OzYZQqXFo3xi;ih&M&ucNhP-0K*mRokz2M zeu@S&&EU{IYPTU6LP8YYUB`C1q;kil3cnVZkV!xuwv;(gc&_`#<|_cc@cFaT z{i`vfo*!4P_`9VKk@cHYX^Sp|a$ED;Q%!A8RkDlW9ZCu?mB~uOB)?6AS)0`h8-x^r zXRS0pcrN~!nN&Au<^zf`Z=me{4sRJO>Ig+UNN=obWMuUmb(qbZ>7b@;fy<+wD*yGx z!Ys{IT0J;${9VB*FmXq^U0JKs6|lAlkqx{#A}p6$!Ou;YTn4w3+E&-gUQQlP|8dgR zA#7TUN?UXduOEZ?#f@kVZWme)zetmGjz(@8&K*I}hFE}z5u#gz{x)b2=gn}6?+hjU_bTjteta~SpMWSeD(V#z5u#S)fet0Svgfnh$#EbfQLc<(o6R zTgg*;{<#uQ`WCIi!a_jsUHfK(`AZw1k7(kod90P1Bz1v-hqH`)ka#N#g9(&0GpT>c&6@*!+gIw&l)=BIB0%~ zx-!LTh<=D$GjT1tP9xtVo-{l1Iq1?p4A z!4IylSsoYT1o%^m!PT&_O`a-QPUI76M(Rwd>OW8={{2b5laa9n7hRx!&`AOoBYQ#Z zAtYYdc#;wCQSiA%4Z4FQ3vXNUC?W6cLssxH*{JluKeyM%!Z-KEKJS%vd26@O8fO## z6 z&9KHFC_@D`6;S)pN5_3CvS;jQ1Mf0n(c~Yx@1rvGG;5hMa8#CW(0XOfKSNH*+790@ zt!-5)mjED(QVH^*uyhUZLx(QUzNQr6@v$=ogN_FyYE>`<9_LbWk;6> z^Jc3Hw=R=B*Hudptud29V5|5QN_fOc89FBP$juxv-a$7EU^knecxRs}XUzSGckOe- zZIOW_oS@?M6gvJ38imzuwnV?Z_gT-?>fsr2cQEHLNb8I=)AW1CS_jwG=$yc!wOpke zIhXRjup{Bne1G+TL%u7wmxLrK9ES7_M}A9L3O@3H^bXUE)k zuaV`nA8K;m2V!>Lv06EIQFhk{B&MGzHC`{r$)^u@fp9_r%IyjH`@ARGR-@4)omNm~`9jq2{Qjl@hj^4r z)^C^T!&VTiDtM0R0?Rl%t0 zqt#prXJ+L3ND-oqlwPvMf{KR)dDeqIOJj&B>VD%*-ePe$A=776_oU}OK2G{3+j(?o z{rbmJ@FI=^P}US7AbIn_k}*^|Kt^BM3zof4()`7{=%bSJuzx_``eWAi@XxFb-R!hDV-B&o zQwvM?eAuZl*{2Y|XyK{w!OZpJ&P@g6I8uxW7G!u>QBKtm%u$Z$C7)NAjX(ziP?N*~ zc^WuhY*4w#>Baps=TSn_C6y-FRkym! z2_E=n1-cxFea{NN+H*P(T^Y5I&F;M#?fM4FwDne4%j8=*%t+W;54<+&ZQo_PpK_Kx z%Wz(`D&`Ymn)e=OCbgS>kwseVB9Q`37o3%5S#&jNjT_8iXD>Xt9c|yxovj_1%rA2< zat^Z6bpG`_BlvhN3?(@IZf4N-Od|uW>|Mj*{1E_iI{VL2d&0Kn{^Ae85TTZp)u~Fu z4#JAlZ#V8F-Z{frP8G!)4UWX;_Jm6kiFmP?@#{gmobx8VmS+wk^iqTcv^1usD1X&+j4hjgIt%Cv z=nFs!f=wSs_o~fB{L}EDJ#@O{y-d67ay#!rAv!j)X`l856+cD*k+<8uStyRQt{wt< zUv?ajj$D3xy1#s!ZPMdM>nRD&NkDF<8fh7#Ckl%(jrG%aL`&-L2sV=6)7eTOwGx%i?;LRBku+6Fc~kMg$4vOyJ)ODn7S5c6|p2`Xj9>Vrp4uJbsgmguNp)2*$z zs#pw=&G8d`xD49dQ4ieO?567?<1id~GKj!5=HNthho6yawUysdaTnS2x&00Br`B;o zIcB*bYZ29hd`sf#lCB9L0W8Wm*hC-Z9<@{%`W->DQ;WMPX#2#Q`mMu38`Fm5-zPI6 z_YwE^G`q@7Y=WLt?PXqWFS4BG_vL11EGM!}wAtL_qyQvPX5Ix1zxN8!k#%~WbvtWp zlNCwmY4{+*nk`>zwqL_;-dg6~re7^d-RPEkeg6SG0Hv1(d3LzygyZIOa(Si{fE1#E zWNkt5yRixR^RBA~4n0hTU5^1}k877p%TUdLYv2R>I*36E?^~BrhvZnS9B$N_H2f&+ zv}`)dSt*VygzLxJF;B^RO|0`N;{@!2!#}R%Z_ovsV1miIwL(LuQ|6<%=NH<+(3?jq z*v7ASpLqGTClb%$Z6kT>;6Xha0%xDU`CU}Ik|ZOS6W?4cU%7sAV!yg88#V`c0FS*L z-vT>YZ)AkJ1;5A>$G%WhZl9iX#KwWaygDgxPimD(Q)Q-r#$?{LNR{S{TDMz$jrnO% z5-;@X)JYe-tgw1uQNYQCpXG-WDmWomYZP;Lk2^R0?n^p<<2?N@pZ@Sv`C8LI)<&MA zu2{9Vm_S)#{H<6%e`}C-$!ds7K6qzO327RH5Kp?_76g+2PkKQRV0J9tp4cadVV#>p za9ybeo8XO^p^uu7^lZay=>0wJukW2Z(4gI6*gl-J`;O;Ag5AA(QwY9rm6km{>)5t% za65P4x4&r58+=yTpcZ=l2>wSaaw*<~?+E~-9!LoCzY^Wprx zjo?VAqVc|>YPq=a{w`34=t*37Wak>ij&Jk~HYcuxGa|KbTQ*XQd|u z4zH63ugL>QTQtfXHoYr*;yasJsUK{VaSLxL3UEoxm>!49Uf~7GN-mkzJJPtSkH$r< z$@&ZUZ>l=hc6}X<(U{$DTBRJTiRImsD0BJF$3anG)Uzf1knz3TM3SmM#R<9;pv8Ib zA(L=?!mm0K{dKNJhyuOX(ak5wFbP#Wf30)v z9h5;ET7h+xfyj#dci620915m`0A%0|=D+rXD~%GxPH|H_d+{^leO0t-6sTBV-YJ`p zh#An5ZHkSH_FB51WC%N*vNau8QdA-R{nHWkCNMw+#Gr22){@+Z9)_NXwhlT;zDG~ouivvnLKgPjP8&q~r^A+~9j`F#FsbiAx4~x>1~c@Y*75dL z{M`y9u3z*js;62_&5RbG0_iF7{zg{k?#2euOt_WCe>s4N2MOW*>5-)rh0AF?3tjf5 ze6JRybCEbTT#uvuRf(U^roPz>w_n?~ZB?=712Ocdr}VltX>WT{xF52_Up>O(6$HDh zZq%W}1{<#7`J)zo5QuzEFRNUbb2HnNS;jd9g_>Webr90FW6e`|bDc0AWb8><;Yg58 z1Ua^&M@PD98FqDiPo>1WzG2eNbopGr`|-^ppNMB)`i?KXSyN4jdaYLT4~f0@ygF4Y zSAGGvC)?{MTXvIi4VoyO3q7;lIeF6x6x;_`{`* z%5t#%iaVD}=?gdVv5|^lb^vdne+k~(JqT>Q{Yyeg-M|wg&3ZMm_l@h(# zX(;5cZCI)Wjw+ehp{?V)QRn>SsE2H(V0XcEB3cZ}(9WlC_8jPA=ZzY^r&q0Cw~bP1 z@qMhHFF`9}40>iSW2YAz2h#Gp@G#AISh7 zU@>^+?s@KrG7MV@*Z88zoE@I;-xO+}+0MKW1H!dUEO+N;1Ju67!!dX(kf?dXY*uR1 zt3qeE(vsbkxI3K086NDct66Ivj*O$*7+Z4E@^}*}GC$gN!P%YN9t7lWAfJFo zZtlvg`W7BcITaw~D?y;sBiLBbNFyzsOIBuE!Hnmk+I+0S^$^4ovRZN~D6;&0r`ug- zlm}>l2igo$x%O?>%C3xq)j(UCRFzKkHyiZ{`pe%*jKz6O+_H&Vl9YYSSQF%+W1RXC znsL-sR|9cQI$H3E4=B6rgstYoK=un^T|_k=u){#7+X{~8x7v_q+ABMtec;A z4RZr>+H?M_Uq8F+q2AlJ!xbm~SiIRg-G>NM*xizmc5Gi92|(rp&B6<&V449X1!S{}b0cN<>m;-~*<1FTFoPT! z{tNo)9pl~E&$00f#--_GWqIE=)yL{PASnqv(P3X^B}y{MwRaF+=Rt)t+j9H@1h=j| zH{Eoyq@wREj}ubclDhVXFK+?Xtw+WQpuj`vv=_QSNKz$^bI3!)27b>mcf~V(s^#AaGNI<1pz_|z8YoZ| zx4Doos_%67Y;6gS-&1v7J_|Fn_|!J7nU@h@Fsf5{%mk@gEVY1yBM%d^;CF+J8UR;8 zemlKMqY#~PWHdA2sab2RmwVUu(Aal!D9O>H#U1l`zh*%6y|t@^bi%e>zDNEbBdhzk z5#r0c>4vCKU?>3Z#P|ea`U~T5uUPg=@NnFPgLBh!%9iD@y+0lKu>!|T0Yae`7OQV8 z>cfDNE7hl(O0Q(j!@Ix2X;N)D-{=8<&&k-nqC+qRZYhL$otXCs#R1p@mMrph9wx(q zGM_*BOGk3TA%;gws>a^ZifjU7I6xQQX=herpe3+xKf-ubWCftpNp9hAL_zo)zF<-03g! zMf?o|Z!|nS&q!!7;=`rwdCD_gU$QxM?HlH0N!w5fm*(j7V8DPr4?x)W(|2*TH>WI& z)IPggWggpGJPHayo|L*mlnPx}X6`P$k)7{|dlYt3##02wd^#_A24LP>EJ;OE&W`&yt!$!;XhXSLOcN+TAO7o;EWh zNsI07myOsJmGAeugmPJ}r+!mMlOri@Mjp+}IREknPF|xe{*dQ5oA(5Kc!do$JG}SD z^<6=;Dc9!0=R6}7=&Y3%<~vHmWyQYkjWq*a?}(Iil2XhI|MHFSr#5RhFg1c~^<{7S z-+bQ+9@F5}xnKSYz`Y6V@QJb>D=TvkKM13HC7kVAEYFA$>MX^x^o31>xJv3@X$7V` z#BmfM0ywL=qwZ}@k%{Q{?N9Zg#E8YPoZ*J0r}|*{@o)VR`k7}1X@+$e%HZo3Q5{pFvw4ept zy3Ba>P#$mwZx>Ec#tb_v=MBw)T2hrxYwU5hTo|3;m=|I;_1B_Xi0+KJcbw$GTh3sJ zNG29Qn7-*dC{y{Ft!FPeztr*KQ3X`=9g*Wk9P|TV#t!j-M*dxWb}V#RF^zR87^(^8 zTJ}Z$ySr;SF|M_9y|_}(eOYkO)DVmRknVxFC*4-Fa6g~+l(PkklL--fwfxfy2^tX! z4D_?%Z(>a29G#%{U;BA7m8zVFj0y6GE%{lWU-5zAV`k2__LH)=AEj@WuWj-GTsv;# zGzi>?s`w+$#FIMKRv>c2Cxdm1WU=EaC*Z<|xKS~tUN$@R;ePruZHC3O^pVi=tdxJ{ zO4p}OK|-SN1*aOeyGds3eK5<^#P(6*T`N%;A;b`K1{>?oSS&QXt_#z%OQ>k=?8F^1 zltD0-Mly@hgVa@4@an9ZL6CfR#3JGE01mcQwOo?Od|6GFEreWt-TQ?5{nC;m0!0hg zI%o48tlrt=D$A*ZELwj`A={6E#X$p~T7P~WoE9Ed-C4=@J#5w#dh($`)#OoxfK@y# zvD_Xngc4Kxkuj8}kq&LQXKC4v2x|Slna|ekE&uY^x5Zk&lQ3W5w-K-t$G5!NiW3-* z+J)jVqv%!hJ%^+oo}Fj9{p+Xf>DJ+gP14^)rn(&D@5FDkrG&H+1w#2qaiKA2OBS8d9-{aX>X{;V0wY>ERoXR zBhwZDxqWuT|4EzwN!T&OitW(Ei;3IuLO82QpPR(|r)kP@7}_}FbpNL8msHy8w_)Y7 zuNYag59COiR|MY+7G%(p6uxdgsM2;DSB#KtWR%=9@ni1yDK9h=klV7*B^DT0Id$H`%yQp2~0QmqiWBSJNpQ^`vR zanG5tR$t*^e!}D@F_nQYB+t$cdZ0p{J)?6wujspK2w>HHMy+_;Nq%#lTk@>a;-6Cp zi}6-`Uvw8HIG-mWPuOS{-J;DqsCe!a*w_sj!$eJE>Rs+{vD-^<*7394x_i^_c+zJ} zyxWp}E3}vbaqqA~Mj<(e9||!X!_kRvE}(tVt`F^WFL>l8k1g?)R+YGx{ah}RYhc&W zVKE_PgO2aX8Qn;Ou&jchb3P`=>DAO&KwPuEzjFHxhHryZ59KZ!+t|6}w?37Yb^UE< z7`b2&Zd*a~+8P_o;m7Ra!3h!f-)?A`r7hD?vUF~7iLp};(bIfZXN%uNZ60wq9>7qL z`NT^YP>!)dai^_F>I$jzK|I;Gq@P9W!~g@WRz0qe=@G4BZ;_FwmNSo+r0BB726)ru z<4E1IVbz({mo$>j344pU!oYQMjlrLe?4$Jpdi#Mg&QUqiA=MKK-s!Eme6Ch5etAqk zQBptT-FA3^_WNRqw0Zeh@kt0s>=DqQ)b0u>j3OyspTY%RhnE!@BcU}lu)~@g@TC za z>hpWo_^oNRvUdHxm`#}S4u-?Lgfmyk1Vfzk2619i!c*j2lIxY_m(^vk{F6DE>+dCc zHQy;;R^~uY>O3TF zDKEW0Jln@#bnl7!RN>$a;nG7RFWC$$6DuU_EF0B73!$aqsj1-yEEl=jLtvoDG=j{?{vRz1I^vdZh1n=iG*U@jl#Jpe5PtkZ0o#r+Q@`}7k* z7uIt%UQu4XFF$4zHCfzKxO-{3`8gktn`i=n{QLswzyk2-91QQ?Y1((E;`teTW$;_; zo}t3zJPFvPF-K{I{EgcY1eYIzO{4D5B@YZO*pNSBc9{0Uq=f!3_W&PXb8>m^IXxL9AQ%H`YY7vGM&MWFE3c$sugW zqX(gRrnXsc5k<_sWZB@74wb*?6B*UG+|ja}zj|V;?NTr>n|`wuGMc;H~CH2 z8moIFVT{AgxgCF?gcwtJJS$;;1as_K9<{4*+#3K$soLf*9Jgh#d>A7izU!&vcu0(A zIjbaj;n-~J{F6!neEH4hmQo(z0d!uCF266NKh{(X|MBiPJbt+KMH85Q*BqSnxrtw$ z+-4OA4N#11y`c6Fz~Nj7EQtq1On~Bv|6+~-&LLZR9--&necj#`A{VmS1o;6T{hH?i zN5#d7hQ*2g0Qi#8uEE6HtM`igN0R64VY3#PGUVHR2`bC(f?}>OJzLE1?@^jFIG7px zQa+@k^#QB0YhsCP*-_^84KH)8`t}kOw=5o>`cAnW-<^FirR|Gam%Vxwh|4JFaauq2 zql3o8O|d^IA@5rBRf^u-we`8@ga#b_#*Vl*_t zkl=FsZXK`tYYuNpQ&EycI|G#+>tEURHXQfwZM8BG86(x(locZx zDyo#`I_`*-k>5^b91)LH%Q7be*q@FyD>ZY!-wKpfTK0&`mPg!3A>X#6mLzRFBvFZ&Vk5@A0 z5jV+9%2%ho3M3JAJ<^B^qmnFv|chq*=N_y1QEiX+%mux}|IB5&@N1 z8p%acx>=g{;`jHTd7W{Ff!*i1eLwen&OOJXTWTWRo>1(T5}Z+_@eLb{5OQ@MHr_F_ zPjvCiPnHgK%+7`27bglZdoDDek1@q>9~`->Rh8e~j%G(IW}glFvc`(~KUhDIkmQQs z_TgGA&PKjXb=z9ROm`*-x)k19wJo6cdH;)M4H?K#pb_iorhsyko8jz>)_?(UC z!+YCLx63vN5ag?=xn{qwMNbo9CVuTc0u@ zdnPQ&^p@L2YjC%*XWnXia6V9JYdz9(Kllyo<`)csNuL1fIB)9=aUDvgv?q8{-R~qr zz8-prxj`xkia{(#Gq=<1V%??;zIFT6)Ih|POTO1rhv>JzDiQ!Zl#iANjt~r_c}MHP z7dT68aQU4;$-2qc_ccH)7B9DyaS6;sEahS*wj5`zA47YwKIs@LZ}lxV5$q{eLAVd$ zXkH5s-H-oJZdYbxWRK_t4^_Mn$n<}_d4&`7J>Z68uPm|j92Se_1uwil=NowMmGLO< zoo^R^=8km(()^9hMp71>jPvEhd**|MZyuEB8;(ihR;B#t*5PC5$n9CCU^nXsuG_R+t?~hZt%^;_{yDNUllQ&R5EM zyOhu&#T-qZAhr>3Fu#8D;+$(Q$u_7nU-k{2(Rix&x{~~bTwahSFS$LKiMKM7$4u|%d|3uhPbSW`q`r29+?JiFAe=xv z2J!PQV8)yMv-2yWbeFdh&J7KGnJ#VvurQxn_wiqWYFYxx1Pj}2Aq)kT%~`8qsiX_h zU0JQU7z8LeoAbB_9XHUn!FaJK(OrBbln=BJsEPB`Re@7Wb9s5`-Kr5%hC$51S{(!0 z*}_1ls&nTnuyv!QAqrHDM)$eA23@wRy~<)hLV-yh98G27RfT`Hm%TIB2}Fcaul(B;MJj=@*c zQWdodbwT6t5^LGq?1-Jzz@VpB(VL*HbYI6JlZUIcosOg@-x zr^TzrjG6ibS`xG`^4j)>u`2Q<DZ`o|LKc^GLqbF=evACdk-ObQ0_AEq7Vs1)%kV?D zSg}?merd_tLii_xbU5Cx**Uh&h?(Dby4xNt0!3(_I;%Bf>g|dQwr%BL68G=)T!IUf zJ#p*osiCDemX=A+5_pIaUR?@I&Cd3VwE2P;@K z0Z4n}uyt*wIuVl^gmc-d>eY|=Y|*njLp1@PK48W!Nb81XtT_BL z`6$?7iEWQ8zFs1|1jslDsE&>NZW$DoTS;?1q!T~#+!%7c5@u1bX)H;1%IQ&fB#AYw z^le!3PUXyhUGuL5b=++&DXGiBo$bq9ITsT@^ovbvc+O*c(vke|gTeCJQ6)uNbvpMGf%lac zn9stl4;V{D_r7JEB7+TmVrosQ3ti=kx0@v1J~vSRHRrf@t^vt(;iQMsQ#5@_Dyti)|cEvYuC)lAv7N9{Z&?3kNuqRCL; ze3~qY_ZIc7A&bo4BSl*gmo|7XOHrCZW)l6zvL$qBZqmSstor1(QEpwSId6k$!Oy61 zq`l&{x3zpTiMWZ3Jssr=Nh1Ur%#ge>f?j=T@!Or9-KOCPJQ8=ME% z?Q8?-#3vieV}+=2ne`I&=47G$o_IGl&Gpaf9Vl2#u+GmXMr-l=U$?f_0~?t%kVCPV zFaPb04xMZegGGooS*3o1K&98;5YT?(erGnGahjfFXg5`)dWIRC8A^XTjW(!bTVh}^#&C`JQ)LZ7F*Aqt{RABpIAsQA+ z4pE;ecV_wAIG@(4^08yvcz(n9(MkLOx4>dVlr>$kE&Sa_6Ga;S;;~zD$K+VV1qC1N zc$(S?x`(tNNe)Pbna9)h!j!SY$_Df?4bMFMn$7Uq(7&`YW#T$im|klB_IN$3N0)}T zGIl_xScHV*a{hOidc{UtH|0dSx?|Sc(`8us-IDDED0#PrlSiixCQHf4x^hqGZ48>VBTyfRA z8dKmQO5z`4bk5O>T{k4sYjZrY8U9bU+LS4$8-2e#JS`TJU;bUj1@~nT(RU*8CxHZO zM_N<U1y2AuZn)wPS}g56Q0lJAmmcp52XfyDiS`ug5d)bQlbrzI_a6`VPSo zm-@A3wCpuwWFh7u!j9w)nK^Hxi@!xDnl(I0w$(cnad7BTLZ_&E?_Rdl;5#CAnqyo+zv&UsdY@MoM`@!(!h5wy%DN z?Tn;&DZQJsbj;$Si1eHiyXHx~@-_yWe=eHiQuXYH;e|rbJ%su~lvG7<*(MgvE) z&!?J%S&R|Pc%J8?-T6@+9dn}ZlELl0XFXzK1-?^41;|M5Vy|XNnatIe15dcE?t&=o zt7LjB1QXj%?mjwq9xnvo!DB{bHo(O=^rLau#hF1s%CAC-HqIZ4Cf=x_?Prpw`*H_w z-E*u7#u^v**a}CVg$AB6`5dBmFs%48!CrLw&ZTKdxHo3H@43yFI`gq@41dP{sTnj$ za!|y_Ic@xwnbyQ1f)ESrwLQB2B~>LB+E?>GHC(g(1mN*ld$@^MG}WAaPpOGo4n-ir zcbG>jhFo|t9I}KhCkz01a(V%O2oaor-#idUxm}^S^qCRB0*Nhu;68PWA3Ufg%P9J) zb3bG5eW+zmj~jgUd&o(#`}lB-jgQugj|?#%N7cld411w6{sQw);APtW;MGZ8Y13pp zwc*7t+ydvOpNReQ9)ciLGBWV=cfxi|>o5En0b3kIK78B7L$I8@Ny$6qt+4^_M%tBr z#g6^w5DG)7%b&%8g$+8KD9p5S2dg9r@b!%qD%{IDr-yal6&`hY9gJI;gl6;+JB2P& zr+@hr-u|@5x2wne@q%`tWj7vH@lq3$A=-&Uoz&Im%y8G09P`5jJ}Tv6z-zj|F=YFZ z*V_~?{&K5@nwLEJ$pt9Q)+IGSarnSRL(O@9B14Ku!JMxSorB-P_-Or!lR3q-pz$X| zB-lI6n08BFKl|^6He0_)%0R5?{2-73Gre!#llfcg-fBOW0A5MS!M4N^<}FRjMVAwD^qy;bp$0oen)Kv zcgJgZ5Jd!`5HzO0UixhjD!{u#Te$^hM+meUZ@hwxpPqS;Ba)}nr=5`zrz}io67Qoh z_Ybq{nCWxDo^}XMy@;R1ueIM}-5e`uD)Vu*D|aHACBDZD%meh&U{HMi5<7HQi4Ws%Nx6tnCI#!$bH{*9?H1jR4W*) z&2>LQM&D%n*6c*i=L~$=JRR$q@r}sUu{K|CC~>k`CPJ;Jbo3B?W-lQEY!rQ z#Bkc!`Ks+Ph2=Lydur$O(VR0=pF_Ow$nED0fK*W;g~n{hJ9idue!<1M-IYQKo?Xzn zuGRjP9sJU7nYIAr&b$76SfA}bj+#Y)_OO(>A*&!Rlv}@z5xmkUsj^Sm2t1M?&WAGk zT$Xh*MftdAQveBI;{r{CxU&qi&E@?eeEJmPP*iwhQfHGDZKx zj!P`jt zln#1!Wi1!Cq24C0s6dt8*$PZ>J{{4i@1)cvEd|OubQ5O#pS^V1QGPvY) z8irV@UyLuh+Eki|*G|LhAydk;C+(IZNAmVNSb~lOG0Pjh4@t)}c<93ALTTEdRs1fX z%(QiM#8YC3Z-8y)PL!nNM_VPzB}#z$xcBncBH=fBSVfy6nw=d05lrZfPA~~RO@3d= zK>AHLke>_RVllwt59E5mhO&b?HA_6fXC84poosGES6Y6OU-v9*FyjL|>q=P_ZArSF zCcGnPaMdK((d%igD7dhaqrHi3g{0Rs0`Eh#?d>{iU}4(kSjAB!gYZT*!u(ML9acM;<{tJ$e6W4BcNfevNPYxLv-!-3__p80tvn z-p6qe$55HX$@#B@YoUc-(X1ZgSK&H*UGfRI3+~hjQg^ww`R&F&=+N5q??Y5WV`gd3 zLXpfLi&H(tFbgzg7-?aQz@+Qm@vh@ z&EK8>Ju<~TCEe4-|KWBOn(^T?!%6Df5>v6DA5f>*`B7|gR3NY`ieWr$(DC1=wV6^x zpKvKYjq39-y~GpZK=;SmA&5cuV`r=hLN`8L`orp@XK-oP5^cHEWq4m;wuj)5qWeQT zsJvA!L=G+HXLTjE0d5Q`6kU6>sB!~&oV0E8+eytq#~2)zz9;*=^S+PrH)XZ4o>YGv zgl_K)@~ca>W54^vrD^%x)X|jx`ga%qdCl8*Pn7xpP&-XnRyw11@GU7WDH3#0w!X#!*x>(h0d(a#pAhl?uN_3SeV$ae zm+xOaz81e=_}^nT`37R6fsXQhRX{Jj`fm{5*XuFckXN;AUEU)b5|8`8dr0VGPuQd= z)>HkPEPFD}nD+nbGFtxrZ?a^pm9I1>njQE}QO4;1_mulD{ri|6_Nc=Db)G`R3zCvi zc5537?)$>&QQbe_Um4Pq3ERPJWcU8t@b5Ps5<^K{S>pcMg^<#ktc?ov@;8)yZ>Bx1 z=b+-Ai*3G#mXB6=`k}k-GOMIZ7X7YtCbKt)zoYpg$qIoOgRV5zt%VDn|2-K_AEsdX z9lG23B=KirNHqipiKv?kZ~3svTyJVORB~x-7W~^IAFM4O2r=tI>vvuE_6BaZzQ8~C zYc^2kY2^gEbO>vM zhm7HwP#o{W&uw@z@}Pt>JUb@;b?JUgh~r!^X+nFb>^L8yw5*Ix3#qON2+>mEutz!0 z?V;0fFu;Z4j^SfC3Gx>2ZPVvkDzE%rz3OR28nvmXcPW9`Ip(B7z8XshAsKtqSf@MA ziGxE=D1#JhvPd^{w|7PW$dGDD{($bj6N6D#Bz+}M0QVGhfXpnO3+}1`_3we>Ha+mZ zQ&7|pGfL4Yd9?E%TrqbO)6<;4l4#d`7u6L?8>9<>*tv=9MGthh0z3w|pcDXE8ebs# z(#to5KlgdA7`(+gjobOtrnhoVBjguR!o%d`WCW*f^&(U0<%F<=`l9ppNNNGSmh=gqQ>as%>ZG-rz zZlWPu`a~D@KtdVF&Dkl!j-Mh=N;2L~xpU_5XcWdn?8t~N>I+}s_fT&6F1nv@ple9d zj-YZSL-u{M8V!IpNos<#^OEb51OtyeO9S-?nx;ki6@;ZBIl=*VKI642%45kSNdNaY zHKIV5pG=|DUD_f5yi9^BcCj$Y&c5=23(bc5K9Q_L- z+D4@e8YJ-#B3Fc{$p-gx6-wtq6o?|(3O}VmWuzR;P9;GG^ z1-(rEXhj1CNbRC}-((BN<%b9&gZ|ix zTPYbNI}+RHHU*c8U-WQY3dfDl9k;z?IQkNSr=yT8S|w!6(vZI4n;{xULY5HJ#nz=p zl!i6W%3%FLD%Y;xwS4eGjwr!CRNmG+#WK?&-3m6QDBXvw%s`BJo62$vx@Kok=v&;% ziKfCSM8OCXPg7w5moU1w$hz-FSW<`vTDJn624W!(f#{P<;e=5LaYE1Xp-;n#koX}C zGOVGHX*qXRQ+S#L5o9ei_BQhVxFVFSP-=GGk0jt^V}yy8f46AdzvLL#k`K>5j3FKJ?zpvaf=z=~63mvMA1@l0yYy@eBy_NPr zl=dJfIZ_YT9>&{nGaAb1-gP7}nWZ;{fM&GRlE*-;0BM3NxP-SPG<>Hp&=77$xv>K8 zWCJ?AfioWcu0_A0fo~(ko-k0LctfgX9t5a0EmKGLc2qlFkrMeb=6}C!XLPe^LEU9m zyRYVgYRI5M>UK-4Vs2-1*H((cj{qZqv`#zP*H3vO znPGk(u?&qpuAWSxq;~k|pPh9=yhtFizKDrfpq{L=-U~dc^ncVaHsfTM5`v@_zfU^5 zwKtpuNwrS;cn78c7%W<_V`E;tl?yPpb7)Ve2iqJr2bYSUtkotNt!XN7VB`Vu;np!R zBE2u_wSFn<$)*AV0xxosW+^{Y#`ilv`$?dm;`!Xg5O_9_b#X=IpRyGCjUa#f;}Tq` zN%|T7mOzFm0T%KYEO#Mah)a@K1u2vz@pKFCTNs;wcEnX!PKE&uu~4SrsR8SZU^hgQ zbOA&tsF8rGOgW+zJGNr5j3u5oeE9XDXeCgOEaEIQ|Rr)(pL>rVhpD}_<)V`gPQti4vTp3k1x+yXHW)Y(&nJ>G3ew z#9{BQU&7qjF`jw4wDYu_kJ_|QP1Z60$*A1dH1a+xZh9zFb@U0BTic|-+&CvmEYUIAKoAHC8+$W7E1B`U_prt4qiYxuNYSlu@PmYP%~ znc66*a}LVAzSbnaFXJmx5-)IVTFx-iotp9I$MET6Wyh1zfWjLQLoKz7y&NlB;TXf) z2sDEqRUl~YeKQ`q83HuB1b{Ov!4lt69+1bBDaB2alD6Om3u77dW2fEveqLNmew}pw zC~pXM7be)2uE3S_yzDBV_N+xDtR&rm`=Q6cVxijtNXNm3PckwwO_4f2={p$&nilOq z(r`f$Zk`R~9ktcjik(&&OvfR-ALdDtJ@#kHNlJgl5Uj4SYpA8wU>O*AGdRV1 zvNKa>H<~ANibO_6Ch>$A`ye^N&0$_a!?-VX;R@;)%*^2ZFuX?mqZUdAlHoMIC0oP~ zOodX~5Oqo3ceth=MTU)XXVs4Wf<(>4=QExQ5<}S%a25{p^mY9Zh(f!@S0t-dnk;;n zjwDJ>X(@|ldIHel<^YBma;9p)nIP-5`*UGYQPI;(73Y{LnxK(C-L@&hv+NcZMKd?9 zMq|#M29-9)Ej7B)h{h47l0u%`9)iNnN`F_0>k>WVW|1-34jdhb>n1uMjb}hKPs5~~ z{rfnPFTWI@KU48A8ryG=zdB_8sPqPpO5QZ1LS6ib*&M}Oj-+Gl>GB8qg>JfFiGm{7 zBmO|y=!>@hK3NE$%Xpzt3PC0W_Xt@FD}j_AECHLk(X{*R zXG^_Klx+}i;)9#!%ri$1JH1;e_=s>m^t4k`QwOs$et7#u!$RU^X*0vy?G}3!6N^vu zXu~LR4?grv^9cYAtrqaz*pQo$qz&JA4=<_NSKw9eHhc-#3aO+mrlo{&xRu>2x1FsW zLOQq7sk1Hhu22k;rqmT~m>8&w$2a*y$#o8;SAn#uutI5c=mN_*T(x-I8b#nhzt2q{ z1o4cYl8ssZ;~OcV1r?BDTaIsSKU79w&zQ?e0z0^`8nf)o+4D85qqzNw{*i~ zC&1q%Aab-b+xc@I>7UNK*xyjmP>%(DNbh?RwPdnp#NvFjQ0s2k1i!>EJ3c;B)BfE7*3k8*)Ec0((E8(ER*cn^WS+#NF*&y;P86gL!~qsCnWYRe+~;Y~ zcka*B&hqD5Y}gLX%_cuQU*95JsbI~7daHMIRYIFStN=U*>quwStbTfBXXJO!JG-g) z)01FsZ3MnuK~2JM@;Q*$ex96aq~tQkZJHjmm?y(`W6cn0e>XW;+julzhZQUF4AEis z&9;Eq``~Yf8R4umy_6jNOVe{JLag)6O}@>Cysr)2Q8#~Tq-&K4t;@5j{G@k}r}yFm zxz-b55~0O(4-Zi6%YzXgU*t%pfII)R2%>dL$Wq}r$O-0 zk4p{WVIc23)g|cGB#=0d^aUK@)L8gYa`N)>SQ6yhPyUrTf0y^A5}pihFxSq|f( zfRJ#nmffCz$#lu4Q1jWi$&cFdu&%VOPMoBC5uyu4F}nSE+ua8_Nl{j-i|W$9SVEQz zn1V5OP{neSX0@cfv=qQg3|TzGA*}2SG77*1sdW*$3#_o(WmbI(H3=o&$P-0Va8`ol zJs$+Vy9~7&vMD<^m%7q%k^L*QN8(`NIjq-&wbsciXT81FUPgM!!U_{k{N+A8SV2i{DaU};@2Kc94=u$75{OInMoiSiOO9Am}4 zXDgvDLSo?L*pLhn=M6`%qwT+F2ooTVTcOd^((=rWDBt%re)Wvh&-!q=*Hh9GT&Q2T zKSv+>h#@;Hw0b+U(ZyHUr&~$Djz1}s(t{H;&0R(nA+im0Xqj%uFqK#!9fY7hF1208=%TUlz2|h7_6Sgx$~}ZQpM5%!yNqi2dU5KxgpmWcDy#4g3Rd%m$C@|l>@1{&a zRv?|NjZK4-Ki8Bl?PRg{i^s_frxU!6I!c8$Vk=+B>)37sAT>NGDv9YQJd3F&xtFI; z&Wjh+iQr`RE>#jlDoi#)9p7DFu~rGOu_>&PrRFja%_FSL@QiS^j4}rZV&av1~5b2BGYy!9ypdxO-~1-2cQ$euD&QtOiq%qSUp6II?K!%W&}+h zjZ4C#E?QKt&Z(te^)V)ImRP3tzSj_?qq6O8_t1#>(#oe3t7DC~)9KfGRAAq>ad5cK zZQxnU`$7saFqpf9xdWV4NqusXZy|_+te-x^AhuL&yNZe4uRz1P4rF4O68Ua--SWII zUVe!|z9|(twIJ3pZ1{eZZco-~?9>wTLX{QooNvP zA3vY2bCtND2MpCV=ZU*f*V57OT@>b+wwLj;{QT&n-_GkX=e0~J%`aY2o#-9#1ZLv_DkO{SRBU&oG9$3Yj#Ee(P;4KYdA97+_APWv(nviw7roLZ68 zTvyHR2g}}$FvpZ_Syg!5aK7DEnJHz7%ade^Ox5e(BnB1ZPrKi6>U>h+Jaji|p-ua8 z;x;3okyuQ)o)-W}JHPGVKZ~SBeaPap!O?(XUJUk zj1935Ma3E4L$&E-G2yt+f3|XRD}M{VnC5d{KCg)yi#Ss>k@rP*j4nkr6j4|Fa)FP_^{bS)k3NBXO!zrG0-O z`!`E6zrAl;0c9C^zh*UEbM}_16`9dlUGJXPP_BK$@2o3{-I7a0gf832Kf7Xxn^Vi#hp+d5Xsvj}?eIw%pWkHZ3v=wcGpj3m-3t zhHB9)8Ev>R^v3c05DqDVq$qr1{PdB@q?HTB|TMJAjS zEsT~tN>oB$DzO|HT&RVmlTo+RmXETh{>A^Zmhnj|Hyp!G$HYH!j%9JgKG0pzb!}3U zrd|RNypbZVSKPK5sTS-q*Q<{A+p~jP#D8WX1zHJpc4-&17Ss1hjJqm zXk;DPlKHC6gQrv7na8#np09#uX`S~B$LbVN=z3|fbk-NYrrVvD`d%aFTfhE3<{^?l z2xaoOKc0Z-_Y1(alJPtF5DQg)qywynKA5Wx!Y0f3GG$fxKq2i~q$;^(>E?u&Lp=DC z5_GiDN=Eu07GH+~NVt5)u4DT?1*|-B{ zo-ok|G`Sz~Kb7x1^%@oKh{9$b;rcV7z5Vd!iYV=z9t?KT4Gl0k<(G@@La-L&CCtvY zv^1fHVn&ml!K42B8`&9-8_yjz?}834D2B$bPFVank16=r#viVxnmk^?CF%m#p2v7w z1sGzidgcCQ5r%LS#%aA*$e)q_aUVu&JbwJ6AUC0JplYyPP@7irZ^(_vg7NaYLG+?d z3`mqN`a^Y%Wf-LQJH(5>rvv(f>BNsOD5-vRLUB&u$J$*qMLhX2=X#o8Z(E|-(&ip3 z^B*6V-F+B#^;sMVHW6@6KWC>F*JG(r$ujFF{GGuqTG+tOvTPO_nB_Q!&`milah#j3 z*MtI-jnR9Q%JGMV)h~~s+{g#SkqxVS#(gc-ppP)q??a2xD8f4j{?hk=F zPano~zY|%|wi{C{C{`L^ekOA=94TGdjN_hZcCqU6qALi%P?@MCu6Urc-;Iis}v^v$7sl>Lbq?&G77qv6={~Q zwmidE%~i<}p4T;L5sc`z#W!hm(HrVFlf{}t0p^jhp?qv~3s72`2Oo6!m5&37Hc{D8 z>|_eQ79N59%^;`IQg7Qb&w&rX?U}%`p$>RXAOToxRhHx`H^fS~&)Ry6jR#}ax z7k+gC<5`U6inCs#SMth!VH0IlBs7hV+q(^duu~hMeTJRa90A`wExs^(j~991e>6b( z3ggPQ=yk8FoQCWCsQfF&Wg*J21R0zyfiHd{d)61pDu{8IZOf?_tBMdgl9ratO-qinEph2sTs~SweA_J$ME5{ezwEbkl zQ5ZHYzHhEBZ9m{`3NAA7`?7Y|+zP9k5Ajzsi>tm_-uZ&qiI<)auo&CB&dk>f`U~APg&j{} z5GGiU!s5Vy^q}6`Fd6Yc1RBj+^;}R~Otl+@uF%ef#>pIYsrmxPX!V~m`pSe2mmUeNc7N@QZ_6i0038Ig~t|p+Rm_J3{(k0KECTW zb7x#cGukX#mXv+@Gp_fDPcnot8KTuCHO^;J0dV}v`2f3=hyOogg19tINb(_LN|8WRDa=nNZz5s^=-Ef5=i2pT z2B`42YOODoZMR$@$9o3JbNo7u_YesbN!xY&NkijhGz~mG;BhNoFno|wEKk2nj3W{p z=O_TsmCAokXo?du@_Ewcg+HuYUG%~dYxyD=5uE(0Nby8>xBULKfJ3Kyp1RUd!_~? zLz6A_T$f&$e)uK~@~@R(X8@3G-A}mBkX!5`q46G(tt)@ot06vUo{-;jz;%A;rU!Xl zATZ+>d?+*$)ciynetRn_B@%XJJFVO8*LfR-*sgM@fr5*yc&uedUzL+f=2Yn6&~I!Q=@i0ox1z#|7BswH?>rG5z@$T}e}67uf-+ zoReWCv?^4>I%yh$0Z>gOmrjTs0dS!p0=G{WRT!F> z+|jAw#`KzuA_Hp+j%0{B$W5-`h&tHWQQ@P7P9r5jlDlLWcvM&#;x>GP!IVR9VsJA6 zgHU)SH9r(pdKJ&KJYvToeZ8@3`y6M9Wacah(u(RXy;=qG-DBOt|5zuo$D&2;Y@udE z7os5%G|NK61#jDK*ay;aV!(Sugn`p4asWu$qk+8FVntQee#D1gR~!SZSI4r_eo2bJ z+PQz^+Isp6I6DvV4b3Q&YJ@@%eC^4QV0W60Wd|ufss*~=bmx7pUw{p^YBfD@-bI9) zk*0rcK3CYK16lCI@bGZGw3G5>go$bJ9CSLa_l=3gD*SmOf=ZH|e6B5Ket!;nQy3w( z>ARR6IwrsTE*W9~Qdrsoz;rhHkPO}m@@wkomJ<7;`Tf+Tnp^HmcmuV!Ku|8=e~gKf zdiy89vNR4uTU}x3*OxT5@4R@7+eRMrDPLkkIP{ys8`O$-7WF_fo1lx`R85G+xPi$4 z07sr-*Z@;PmR>bkC6h$DU#!rJlIjnsLTuB+*n;MeB~b5%>$TE1SPE-*lafWrqs0lB z4*Ly%C4pqR_UlC(*QP{1(Zg z*jXMtmPAVs78OM7;Fe?H<$cesH=}G1`KHl-Y-gX58D&f_$)ZJE4K`4Ikai_yYR70M z>j<_OzK&H>5C_RfEB%zUB{pM0gOo|kU^)qqzrzKRCcxsf7-c01@=m%yf{@jwGv=5* zYVZ%x<@Il>Y{mG9h4Bs(<6XBEnY2Yk%yn5Rb*h!{tCL8;Fp-h_;*-!F*|b<2XIMPS zmv5wC+BZucfSYI~6jukvBVR|M`IBls)kM~T+AP8Z@qCYMFP>LcPiUW*Ms~uZ)L^6YL zbrq#M$P%XUdvsTQjYSxHN|QPz3f-(5f3V4{{u=uh7V(28vi_ca#YSCoL!dXsV8iD) zX8eRJt44sMktYl=CS;h93Upg=OwVZGbimLLa2Y6be#kD?mM3(L@MsO(JRL|MI>mUw zIdmdc)yNuG_pHle$#kF=I+%5FEavRCzHhHknh7?&A`9pnUK|DnH>zs#gKVI140)~d zc%XJT$3n4%rgGZ~h?RQK6zi0Q{g3p+Q$H;t)GFSp-mFCQQ?@9p?mMw3Vc`T866~0+ zKpOcwT;He>btn*r!`Ys#Fg;eHwHDJMRUKsmJ!Czn`o;!KE_BhjPtJ5xxtNR zt(b@b0v^3G^@Gyy8mAlrX12=Af0H)fR<|!iD@iIA zCM+#%pg6#_8(R`~CQy*{t8Yg*jra+m+jt&we2A7Wgy=nS2{wHdpu>U0$Hy0d#!i@! z&$Ds3m+J5CBY12^rltDv^2 z#6rQh0dmr}1YO5%{SI@B?K)M)UI1>R0>lnh22oTA>hQxVuhZEuKoL2W^N9jEl#{)M zxmwWKd{p9OO*l4=Ad_2@4-Nxu4msS53sCTnA=h)9S8;K(l$%Ic)b+<+1m0& zK&vqU`qU{<$81NvsSVJAmPR}O<49fDG}iqE-5MKVhMW+1rBu0&CQjh*Vzgy)fJ5J+Gax6#bC#0$S;u35Muj^1GVV~Mm^U# zNi8j5etK)e{L1cjxV~Wq{UC8}z~j3e`2#FU>4wucIB=qqARue%lyG@@DO0%}H~gbn zb$uV?R7^ks1tZ)g@`;(Jdxg4ls2)85`-5_p=gjb z1&pE|Pz9bk6f8jJpaTx{Ts>UGHM$OdX0OtIfA2vCE3sz4N*a5(<6LXRk0OOqswG|` zU%*R&^OVJhuLvdb2IBGInP*BqI`B;M`5|*`n!Zj+^bqNL$bIESNy>D=4F;*a`2L%MdJMCHPycZz z{;_8MM-u(Vuk#bt&DVQA>i_M23Enh5`Ty8)_ee|%uh@ymKf+uK{XHe_e=M-tZ>*u9 z-6Zb^biF%&9Z-vv2NjFVh<>zkd#rC_0G>wd+NZ zVj0`OZl7SOC&&?d3()K)rQkb#s+<44vtCxG0u4iERV@cuje&1-%t^DQ+1V3~g zE^AX>{bSYuVquP=Hj2#Zk3smA^mkB+E~VM=LQ9YVl2s_h4Jqw5J-D^t@$G=vQRHMp zw!VOqoIhxtEl_lf81zfz28GfpnN=2GS-?zxEi||8r2sCZ#3n&rn<|K8c2w8Y9QHdr zoP+L@B)ne{>IaehcoM|lvbGOo>}+1T$Af*n1dL%6=BOVaTSR`qcwoUt)$Vl9J^W;b z?l?!s$$*KpjRI1xs$JAw(y(JMkW5cwX?1)Z{Q8ka`uYrjoZ%_Msg`<(Xn74WItdZp zID2UeHL%vy=H{AV5~U%PmnB`YkViXaC9KI431d;>JOg92nGyKiyf?NQ``7WWD4vCa zg@d!2_rP6@E|?7LcG)3Dj0y60AXYI^_<914>?oz0c5fke980`Qav)I(mc9kgg%6=v z2mD;9B}55-x>UsdPJI4t{>{Kk=>u{q5*)=O;)h??pHN9trFc4Vf#DRggqs8WS8?c< zNO$nX!0=}E%wYPvtcR}+3Kkls`}*2LkTV_>QI9Kew0r1h*_Avq5bums^Po24Wuicx#rZZ5RHF-Mj#=Y@NWJ+_^815e4 z{<tgm&{wEey?#7%rGw?ERoJ83bXa;}W<+fS7vmsD0(Iyg84jOmM}U&Tfy%TLMo@QIHQI? z?Y|&$kpwn)RVMnFsGXP zKGP%9KQvdV0-o@EHxQwP(v=JlJtEM$nPOvOgE9M1IFYS^jJ;)=IoLHEH?D#}2ymwCc#w<2a<5qPhBpFZ;xgnbWo7S}Ns=Dh>q5|vQsS=x#FG^M+B3p*mL=g$BSvWu^; zuf()xcJ)*c@Yic6jPIv6jz5l!i1?vx@oo-H-=0%}b1|`T?>UnBV4;lH&x2TQ_#j%y z=1nDEpL8sDaCvEI^Sf{F<-Zmct#jve{JgkJ!bU<);WVMc49X00%#Q1zHy?z(HQH_V zMSdj7Ll+-BP035&bZ`4j23Rk1s2fOShSTC45=dyetzNqu{QUgRp7nUU>>C{53!rkZ zDjGJB-S3;`V&)CYUZ+zAKMm!P z1m<|0z&^fyYOggd>e==Hv&~Ny28LQ}M>WFF(~np87^)T=V`c^rH_@S1FLSu9UXE*Z znT8x49l=N12VDDINg!vl8db)nDy6ofzli#c?H~SZ-W*_%a zmN(Rix@E;aPF5hp`{-CfYZZgs#0S_B@NZ7ZeUzz7;uDU`3{epg@)Xd=FHMDA$V0f- zGXQhP>F!pSoA)U2%|jeCBeRV~Z8LpQHTRKySR zH|_7b0I`=!4Z9qjVQN-BVQ30T9E|1CNa?%^R%6AHaZghjncfkA8XFwhwvY{{IUFygz!KzF04asMrR%HU>0@u0LJ647s*MI!LqHcn z8$KP+Kg$3B=Ixs~z=s_V`4G!ZkvQ#IkZj)JP$3RS+>nGQQ=zp~whki#W4LD?ih*AJ z05%ihYG4xF8|R3gH#M6&p4O9(6&`BK&G@%vuf=1k+O&_*d{D}sqhp@{6GtS$C(GN# zA0EHq%PI2zdOORYxPtFZ5AF_uAcG|kg1a*i2oO9-a0%}2HdqJ|oZy-uA-KDR1h?Ss z5*T3cL5IEMxBuO(+S;xCv{VgWuJ!Fc{q{MhpW7XS<|?o{Zpra-uG%(=iiBita=K)s z81QVVT%+oQNTTI)9PL_wE+`-)bW{nfE`BgCRt|){=n{bO16~ZF?X`ciu(NN_z@j!^ z+7SMeHcAh z>9UNSwkm@*L`EW*YHT)38m85FdW_cnmUXc)y_{b=IUUpjbeg2a(D3y9gYxkAq#{C| zo9V6~dM^O>OALCMc&?+X+aLmH1I`0g`Us^aL~I3d@9^2L(;y(3ce@4%C?O69`)6Z< z-7px8AB`81gyi7zHbzi{5k7v8I!dg)LWmIq1Xc8%qk#TB0^YK}6M416-y`w&25Dq5 zB(n>i;wHtn4kG6vvGpOcSwVCFom08f>W{5w>xM{$fdi$1!_8w0c@UPq100>j>F*9J z6BCnR{dpPT@^31BPCAX+0k=%?(514vmKXH~p6PYoCRGQ3w&nnD#M+bU5Y4+0N(TDO zdsF5D0y>2*uj>EbeNU#$54J=MBwSgqUo^HryTA#o+Og<9>jiB;Ls zl3QlGG@Ax>Z0{(5nvSBT&{(LcA-?}MXrc8b>{Gh|Sb&D_U%#3IRiKU|))mNln!vkF z5hl&$(M~qLvoP$I=`^v!4{X`0IC|=gLWm`g;!s+54{ZGjiT!QP{dzVL0eJ2~01y3U z$ZVQNq$+I6jLkk@DFh&g<>cfnVKY+-%%6ZJ&Ia}bB^dK8 zj2kX^LLGep+|YrI4{4$WXzJ9o0r2@jjl1NC4met#9k6^pp%*okz$T;e8^^vJ@rtF= zPUtNOLx_>5Gw8VqW=b!u<-XqU+hh_Fd2%r^B~&@d4*xw{PeIS}WmJ4_wT{x#HPPc_ z9t|M6gKAu|vI+y0z`h;HJ(P~I^KwkD9sg#YCzStiSqb?x?)6`5~WpPno0PY2;|Pzj*(c~tMWgOR>@V|>9@WI5hTRCFK@dFFDwN;8%#^byIkl^zOCFay4GW#Nd`RMdz!+&0M|-&xYX* z3s^iCAer%AOtmlt zhEr}6QYUTfFZv2^_%k{BN6+kA2A)?(?1W)7-&0J~h!>@oT4%`7m2_=Rz{a@(cdZF$ z1SJg{zE)YDY`UUq1eH(jJUN@_&Dj3?J%<++sQZlvaoiC}PnrUhy1dHlpZ>=N(xl^u zxlfgcHaT~6u?*d(w9+`tkVHaOs0Ag1qv(+#Dp+y;<1_AG+-^_Qnv(*kS!K$HC+*#* ztT}~pDR<3JDn@T#v+(J3M{myK3Jts3AasB#XgXQ2vutloUQB5K>!1uKvqG@(6K_J% zc9*+&X*)_2=h1s;8?tk$sc^$PZ|k*iQLU6;m)g%V>tgIryvUH&0Zj41ZU=xD2P<<~ ztnYBLvJK^LLOwd;oK!4mFQ594>mOF~%89X&>1od1eg#{hTEXH>rxrZZm<9oZ6VtDv zQJwI@1z9O4(xY3W%Jh|db?<6^>Agt1$S^jj{iUGqi@Fo=)ah}-h4s492QwCGd1O<& zf-k`RO;K7VS8*n9DBLmH;K$cJ0cY5hH8rb7y*6FDH3d+lpytQ@$)&b&Qy9&y?AS@p z>TUmrUg89pJ|O`Hv*D*g*Tx=2Lh?8zo^n06Jh_ki)!D4wf!yrX{)nSmi z1c5B^#wT~J=ux02WrF=o;?9K8U2WVQDrp%7pTeZ)PHF{wT#=u()0_Z{o<|4#l_*bz zhlUSvz@Q&t%?oHxf?(H=EXs6PH0r6^hpoyJ!OpS+ktn3EIy}9yOmT<} z9lw^Zh^Cx}42($?U9kCpo-1e0Cuqba;fr4OpzD+NJ|lZ(h-sm6kxn4q6TWHyzFc?* zRdh*U5?1{R=!lp;GK)Jgdi0ID&X=Z?q<6!O+Zi7nU&?##%^{TUKR6B}lN&N|kS8S{ z%Kl4rtbmqO54MTp(ZGy5bNtRxivJ)XaVKZr)Zco!)?2iPNym=>4IJ5r9@;*I&;%M$ zTmQNWrO&emaU!hoUmEi{TybY$^2S;`GCweg8xYBGtDc|u@ygc}wSwcr^7*F%bPQtX zt5*YKfC6IT711vx3ylrS|3}$aH1Zw#My%`wMIN5rsDjA@H4j7mkD}H~U3vIY5}@kP zar~1%HnL}cC{ybJ<)8t|{(sfn|6MsBe2C|AF+uV3P*wC!Ipr>=-af5a!p~<_LT#@EgskftHRXA7$(238qmMQn%u7~M`) zRKref>U~=`CpJLOIt%2MmK9m~ulAILp3A#O z!5Ik;Za}=MSEM(KY4KM-aGi5jH0cYNsIWj}`K)b9*5kYp+BAuk|EB%s4W`uvr?8Qa zWDxgaire7=os_N!Ez5diQc6mL!_6yisPpB@3zm-aXW~RKqE5eFy1qX9Sms6HP~mWV z6D$As-K)d&l6~~f%^w<;eZvir*TNv{Xqw)(o7S1NllsEIML9h!ytfT^;(|}!`sQ$@ zV4(l=^D&;#U?^j;dEmou0hyRAt;;lUP<067kX#;!t+@0f#+B{P@ z)5Gs9_!dTBqXO^t7?l|8W?lZddkURCz!+fp$>wWYRHi3c ze4_Ku$JJ09N(CV@GFak`4}Gip4FV>Z*>^lcCW`dWY-bjJi_=9V3nb>>BW_MBUjdyG zeIc5v>*Lm-U(5c{zJr3!@YeSqO-628KDM@wn~09KP<7n~2>M66e3j7AFwBbiv$o$FN_bt;l%?$A-I$+s#=NT4TYKZzbFH z0rwf0QoYBlnVrB-`qe8OGO%C7&2EAef9Stzio(bk63|G%%!N6YWTRJj|b;y_lgkV9apTRD=ME+*IR= zb0jRR zy*Jt7tDt574T3|Fvy9uA`rfLRnrp09f!*hkMD!yRZ7cR(3vx^-oflJ)B^?2xE3LOJ zw->@4Z+jD!XIl7`l(S0~FhZa!UaSQL1-BZ3eeGH}3CP~A*ygryT}%J;@<`quYsFqyq2EW5yUtSsZqb3WU# z+uMD3$O~1mVOsg`HcRC;t}#_2;74=j8+;v@kA&_T){^oY5lKsqd@Rw`I&@Y6jfN~<1_WPyGUcgyH@ngPR2HOHxk?ZNLHf3jkAR37bPNbO z>$)LkJ-ITA7mHY*X`%ce%JRm%rCz)KjFN!k@%&eyq~v6`k;*SLfhr;K3yqEo74&s@ z$tP!8>Tba|hWHx6{rJ96nO?CFUnwatT*8l~<&95kYnBY){R-PSv_BCoE4+!BhxtUC*d2bJ-Arb z{qA^%JBn6u_EFsMHk{YhEd`LP@xt{yp zrNs-M%tqiy2)#wUyE)(aO1DAx{QUxZA5+X;+WPx@`2E|e#mfbn#^VOzqvJh-(Whm~ zVjM=5aqsXR@%bK^`TrUax=aOG%t&^b+|YlnOHRuf*{l-`TBtL<5?+2!^ax#RAkCW) z`@k@v-@h>USfb^P%i@m{)mLfKBz74&Uysvr*3alZ@SCZtSV3L{WWU_YYWd`R7$cH7 zR?fPCzlcY6@J?0Ge1_v!#vNZVn~#};uHXWzd!UBo*03J|e8a{SGn%D!<#!kFfWMi; z%<6DZ)?~(gr|GfZ`<1X=-ngT7$~AnN@Z!B#x9-_=D_^{27Qq9P+u!`S^+fa+ce(pO zp9tKFbpk^ISA}_|WUB~pZm$yEd_+(~dTgMC(zZExI0x=bg*#@IU}W{EqG}U?%l5C< z@FbG?Q;ruc*`$s;kyWO_FHW-VSA)*vP7|9+fVX(pvhi9Z$op5)8*`fV%R!s9pfhs7 zULB|NdUx9-^X@uurR@i@9u6G3qUW`Bgw-5;U!^+U^3CZ`IXO41goX8-jdZqQAGl$v zbu3ab#vgY$df!fNE*$F)IU}!VZ=F?x3!1EKdo+YHli*MYW zyA8@mkDt6*4MINepSh1TZfTsm$mp&al^{zRcANF6XbXw`=q&slHTW22>b9(JH;;$}|W!g1Rx&xY9k3izky`Et!iP*a8JQa%%T9 zhetpC@>CjWwWiUER>~tTnnHT8IzojjDs9XQ8w#BiSG5TkBn}ODng(#SbkuBP`GIyO zC5VL*!N0LC7@v)SZAe8g?$eiURnIkEOv&N^Z2{Qd_+^J_`1u64SQ9v(UQ$P(vLDPk z{r+eTg3c;uKMxH@q%8|yU9UV7-FTB3+(N#;L3r>^+!LC>y`*?j*MY8cV6zf4;WG2n zf@nDX`ATx!nu{ohkIXc^|^}%$- z4RBolydAelu!d_O{993agQQn?L~k(G&#fmb*EVC~BF31|E^)v()(<@e)b5vn<-i4L zi)N0(x-C%`KKJoP{WqonH*Z{w2)#Q(hg#?FHS@fbF9S~rH__?U7 zl6|lUFjwbna<=NxpaTU2_Xh9{fC(bzzH6fqu;r;d4(pu#LBz(%Sx{ac^ZV1IDj%8! z?kus;zF;eX$uf(r*5Wg2vGgyeVO@;@1aw(T&Ibvt;H_!R21_xl-}8_pRo%P?6B*99 zb04D-FpA+f^TAHp@qZtw#MZPbmFeU|Sbulsr%5jrY1{@{L`xeVolONgAD7~l*InB3JBCu=t_vI=+mOx`0Kh)3sYQw=)5@M z$)H$NwkmOfupD0x6D-KP2^Zm?7F^xh7p%1_PxT-#-OYI*_V)9V5k%K4RFpoWHAk%V z5-#?(6*V>82l<67pIT&{^`a6I@1rJsST!e*pAd7OW+SPAqOt@YQKX)jL=fG8mv<^i zMXz^6f)pM(&-jq9Rjm-#)0}d)>T|Dh2UFT)&^>GV%c$}Mx7c5E;*j8?*f5%{>=&IE zZC=|**RFU%!s%flvYyOv=!dB6MQ?`iWB3x*@eJ3`q&Fn<2ilAA8@yXna%$~q*HG{$ zVJIH~k*S*PzDUsp?;@?-B z`1f;xX&`Z1_-Z=Z?ISVPr>sjLE6Hb~#|YvN>%$o;_V#PB6h*zAs|uLJ^Ki|@yqgVf z-;Bx6Kp{A=!v>Ajd+59ZkGi*$Ag}LZzqvz*#X7x=lQ}PwQZ{7A9dP-Tg_btD5kcz| z3ssellwjySueC}CH;n}Hk`=Pu=k3`kQs7U>9XpE>^5DI*-(&;JRkHjkX{KD6v)Isi z9aECpO$7BM0OUk_cok^@#kq0=G_|ltE z^o3~B${UF@ap`W*wenBaao3lRmAKQ0(J6b8>DSPlseN*_4&zGp9SvwN+ibI7(_Kny zEaIvp$>b8$NW#=>tZ-MhVdt;KN&JCS^iZtRjD}2L(aXQS6@*;(IhG1eAOA$(b4IP^ zd#`*f6^IJm#hR^CKC^UpwY6nu{G6JXDPvY#OPw-7hl)fmBT<{ z9?Mr+M=7leyT8dmvi*MdJxSW=3qeSwHE-n=qI!P9ygxe=jhjJCMRyovAIxDhvxYk_7oF8B4H_1aWe6tyj&YMQpTM2PXC|@{plPrQ&aEp*S=bqwfCPCPP zDV}US;zEMPCWE9=mgui1eK2+}&QIEmmEjpG&Hh}mw(@ddoh<$+;YHaG zH0BLUK05&r{2$Hgwr{0(3VY6sMq;7YP>_+^+!hduBu-9EyU$UqtbhM|=d{1XOUXeb zmKY=z{6rXX1wM9C-^||gnStp6CDO4sVI`fUe zn!&TH=NlRJ+ZGn~%p^8DjnhY6QH##;Exxf#ChSm0=*IGxrR?V-0t zjZSs3-M!O=zx{O(aJP##j?Bo8%ccEetqC|MHVCiYkaP&TN4QMs^WF{cQBWDKZYw^u zvx49oMMqM8V1^r)iT&9@;A6x*-_fHpGU|rw{WLbmlat z{*RK;B_Ayv^t|{^4sy8;Dm#R4=@gS^tPhtPh9RyThNo>1Imgo8b4$*SB|8J{&XSsb zH-h5Ztqnj=5fCo~eBJ%@cOXD`8veG=)qjru@b!RPi(X=R-S4!dTZ{hD?q&QO7o9{J z7|qQ`OiXVsl|iSpMiC58%(VzD-haOrveo*2#%+3 z_$z@$nb_Fbzm=AL9v>fHG<}6rr3X-`17mC5qK=Cq?LOyIw}{XizpMPMOrF#C2P3G~ znPLdmhaauq+3U%fshWwt_{_1{6T!y3k@@mSnQ+Y0TaQtu)<|`~>ISTm$`eYb5N#QoJDLo$QpNqETcbnFp7Ueo4U_~ zXDRkeF2E|qX=&aXrZm!6ch~#ePU=EG*i1X~-6h|%4^}1qhT35K1s8{Q#~Ka{N!!7a za0X(<;l>;ws1BvEo0fS}YZLrFZvC-FhGT(Ddy4&v_f^`DGaIKL?={_nPxp2neR(5f zIx{bNI_e;#RuC`jg6sRrwOgh))|2>e;R2d}L%DvkwS6J?xv3GW!X}@~}QGUsBDRVhjs^7}vuh6688OSkaz71_k_LUg<8-%bj z@?}Q)&ME;Zgm`a{_pza#Q@I^y;g07;Wkh^oRPQ6PxZZ7Idcu``gzTdGEid_YoL(&% zM9J}PIa%W8nPn7aX(njFyHtJn0BOMl;)HZh>?UB2(uJcwn3KgQ(pdC;WWYzC%fADy zO)jhW)Y1HA4Q3)4o{^*X>tv2jaY1zsDQ3FEzqN_F>HHa%ceppsa3ZDb>};uajTGK3 zucbRwZ2f>6;dIgmLz%I%AHUtPuh{w!I}uZHo0L&cglX<~zvOfE%=7uiUp->q`|7no z8G8BVQ_<#`@%?MvBIw7R09xKdY}?eiE}yz{tzKdhJNgY`@M+op#H4~cV9WL4;azl% ztTar(Eq|NmFYKetzQJ_GtUf?wI>~QVf8op$mQ1a2`fm3oXrJ4c20B+HT^_FJ>c|p@?zNwPXBw{G;pZBOyQ`5goA3@!+Zj0uAJ>=G@@!FkEw2yRf%?() z()w^b{FIZqwo`Hrv8A7kU;7ZP&dOe_1HYfBi9j;f4<>d2e*~6DH(k1ZW_piMVJW8p z-~LwH%IbHS=(q>P+?o%j_gi@UGfFdTdN+cRIJ57>8G5gIXq9--m5v-BwKJkLo-BYr z!bwPn2E;>LK7emyAeo%wC#n4r2*f#^ZGOC8$Dd`MwdoRnSrV(I6^?)rF45cc1!K^? zm=}Mu+_O6R3Xw^eLN>pZKkU6+s_i_Ft2KT+LJH(u=0*qUf?Y4t2hM0m`p^S_DbP9T zKhtH-PzqmvS4S<)e9rtR5RhfdCm89JWD|uSn$;76xL<8VxKL_eT@OOKGtQYnha;@S z^XHpFu@3%(EQw2Y&jTmby98X|Hs%Y&QkS?4*m7<9+_nBOlL16l;=*!-_b7#WJJ=hf zqT4BR)#Nllj{1gS;$}v}{hG6hkTIj`ITgb>l6fwBWQ`g6c_N@i=py&;5f5yzd{C!O zbYXw&7|r)p>y-=I>S3^fMqN$=S9i|MBRzVpKa@T@8+g-R`I#d^%(lB?i5LC&uP4Ej z&JF2!4Ti&=UEzLi<4?d;mmcO6!{Q-=Uf&Va@BJ%yefx-6ptW)0w-aL8JrjHMn`=0+ z8m6YBE+#+Lgt@0&VW&Bj-UkLFkYiWx{kh{glV&s^A-(rT-*fjW3$KpOTn?#MexL|pbRVGqKzqNC`%#cPm0uE*Wd597 zY2=#HO1IUhcBWXhv=c~H0Ah@xB`HwD3WRIVnpRO%{AUw4f5F1C3(G16O|5MPfHV$U z(~%4iwpotC&gSA1YFv7O>lpfM{cBh#Aa7)8;o%AsIrCi;NiVI zI%sYRVA(k}1+1IqwXm*tS`Yi^A(;>Sq5Zn|Va3$aK(Zvf$&C zI2lE+ynkW^(gDeg_-{~?BmZv6oub|whk{#=l|HqFWWVKc@3i{#GGNsX`+Sx9HH{J0fij-P09Mr!3Xx_t`cEk5Yo;g2&wjwOblW z?Ao&sf5;{|)0HkmTvlTwFtw_jctAddXrFU_8uC!MY@hn(NTrCsk(CJGFkHFb#|O%v z+AMfkLP=h#u69cRjJ2*lVl3aChhe!umQs~y;ti6Lty)yr8evAGBnY}Jlm2hp?yK(5oa=01zV8Vf&L13D@7SGjw7(;rvAiLA zO3nF45n(4a7PMTfI1_)h+$6nysE>1ok$T>Sa!;$VThB}JE4eG_yz|#Jy3C6oVCeAr zBk|Ws_Z*G=EGyTP{U7dAq(VLwFA7n$HhEKcA4IC(SPuB3qa<#aZLrVtig+!zf4O68 zIq~A*0#a%D$4#b=u6{y~9_o51n(XIv1)fyNqe9ulVT+-N>>*#cB=Wil;^F4s6F0oI znJ%*v+BWGWTzxykwfwAWrRSLzkopdL?gUFrzs`w%QyASf9r&6D z_QA8=V?^Q|nNP8K=YD|Dji(hBJzsKsYxvf^`zf4}+O7Rw&MlIsFJ1|7z>y%Ibj$|1=R0@G#-f zk`q(BM-=?=dO>}p?hje~u=N{R?!c8^-)IpQTN@Fk`3sROhrPG${83BHc%Rx2i!wOG z3}0*K@0pRr697Yce%Nv+D4N@-0;G|jxUW&|-gW_djVt*K4Ei}uaqATU%=hn-wu*Lr zCQ%It@n!*fv`kZi*X-2kD_}+dx&AoIn1duf`ULea;HS>}{%7)t#(@@%mv zc&{$j$z7>A7blmuEfZR!U4uVI>YKWMp5cn`fbxQHaAvYbpl#Sb3`QH~AZe&t3d|8x zYNh;S?ZZz9KI;#%Bfv#1fbpv)10dst!G+4v%wtWd`Ys0(mIunOX!kq=3b#%C$3&UZ zWuop)48r>AW>itFHu)k6A1)p2ke7`cw(M23S@NguW#`ZGVAG{4!n|a>eJ-ua}7yOa^Z&L!iD=5=ZUkfz32{Q;h# z<7cNzJ7@^fxRhx&D(VOL4=o%R&YTHg)uEr%k{{#K( z+fKQ9YJHM?S`r@gk37sxY=~9VP07K*ClRm$!}2BMDAZ8(vV)t`jUGw8 zT${CLGB*D6ng#oiG0lFx@bTe(UD*RJy45gYJ7k!mUtZ)#QHd(Ey9bn+r-@z=GYU1y z!cm{qc!0OJnAJ7_uQJl8E)9U=G|5|}PQ^d?ZvF^4ouSltK#^eI>aa&@k7l2W3C;ba ze{*GJ?~-AN5ml`N93A)LXrhV9X~@jq5PljzT-GLICIa`P+buQwMMblluiP@3z@&>| zq;PDKREFUX)HwS<#hz2MO1u@|fM%7>4(IQRVT6u~ZKf!Q5KMFAta~U2|Sv@{fW&7G-7qTD8{s=oS0AxofUZo)~n^;Qz3T{fah& zmIu`bdL2quTh3OrR;4hOs`3=@_0xA={QB)lzg6EZMV@_TtecMQsnP+xbx;zwE4B3( zABkD{_$vnMnXuZ|CrdPt&#y#T42=(3sOTBcO|0jrwvF{0uFKBy3>+PvRzKA+WF{f8 zQx2wZwRY_QmNESsm6ds1cU`}k* z=m{f7L4rn7fx^OpFUd#vQxnXwW;OjPzxOoUH`cfvIMHKDX3+Ez^p!AZ)Z0Fgc=?=H zbdr2o{-@%BNoZTW5lT1o;E#c2tlI)Mctn_qm6ys8FBqI=BlZ`gnzQt-zk7I706F2$ z5^0V4XaBjL6C<=>ck3CkGIy`H`%wv_t%R}HX?2kz2Lqw+Z8N#<6;lg?B1%SzdnGlegcSXz)ljN6#xXOie<#5&b?!U z!5;GCjQNhPXrM)s$1T4nSv+l{{AmAVC0i{qwJ4703s1#Wt;TapN7>Q*p3A)lrrX zapZD1GkM{H1sJ*X3;W!I-^UlCybVsSj4%pGHFoZ7hF|qtRRr`G7o_&I_w14hvDA$E z>pQo-j4j-wZ{q|$Ql^EKedYP?&}TSengbxl2CJ-q7_b19KCJ+}G6Z|*2^G_BhsrAD zQ(YY+0e0+PN_i(q8gtoLjq%+B;;D~se_nfJD^01U??1I6vAi@GKvI-kI6av;6@K}) zu44Ya|INQjcl=(D`cvf%H-y2_NiPjIgzvi;qjvqhOkUsH*D7A@I}OUE!Z7^`WK)A? zzxp`L{shT4D*WV#*`=-TjSP@}Wvb3g#|%|1`M{5$PBGGP%b5Dbh7H{;defF_@7voo z=?H5KQsTLi9h`kReRI34y(D|DI^EXPhtM5wO&7lRW0`d@ILN@c2%KD2Zzg%bS1m1` zCsS|;m8wK6erCSX@1lBH9VYFEKA?`5a?q4RBkK4L&Kb-%b+jH3?2ghDnt56R<)@P} z*gMPH&+!a+=vbCoysF>@GUDXdRK(Ujw4W0OA2n~H*7ySktTL98d%oOm4`*GMhW9(TKeh=X}1qmjdoNy|-GctmC-<3rHTNXk*9DKI?fAe|wB|${(1P2ITSw zr3R26YLy=B*As&iBjWcw@%A9m*-1)<)JH|KfAB2e<|N?&ZbEtCY+HYA<+7&4LJ@wh sBF#S|)t!#NNkWAHS-A`@3u1b?>@&t>5n-obk+go&DP96?;GT#%ppS?Z2Nj z_M8#+_fPT?71Xi*vI4>pm-DwtP+hPz)8D_SfCN4NyN#Fc(Gvf*x1I=#{cq!6GyQKi zywWQgcM?+`tIQnv>455|7ai~0$vwDO7x~~%b8C}QC|V+vQdix`ea2D_lThva`iM$F3Lj_>S|}6r{3|YC zq*=_Z(NJHYM=N2#M?U%6{y-#w&N@E}dJautxa}%9TS5JO6TLJOWcZI4;?x*nkogZe zL}0`p2lcZMX@ebrkA$v&eD{kOi=9vi_y`sG)B8ifHBd_6<5#Lbj=2E5_CGvc&KONc zMKvX^RMhKxw%}oUpvmWTXu)sx74vAfaVbm((MKEbWTIFN zKa!tUHA+t@MaVHL^~oE}7xRp+>u>>+0HoWCdFJ5^k%{Zy-T0=?svE@Srx)wl>9e!5 z)h#nuL(%$1Mw!&amA3sWJ(2bc*B3{-KE^`sCaS5AdyLBrom4XntN}m$;|>O%psgr= zI4?(eb&%K5h)_4X>o8wm3bCp%J*Pv$a(8%oIIX`hrvvfc=Ofj+3q*Q&PX*%Bv@w5K zd2PhSfSSJ<>T2mGY|3iC?M$$+u+Y>wj95*4?D{DC=3%7k;UGuV3l`7iu255*eDRy> zBdd$GWcpDp@0 z>NcAfXSzQYTJ$u3VfgJo-sUeA^_uM6hB>(CCrKrC7%+xNU3WiChxORYw4>{s?JZP1 zrK9*5i9iDd+5PH|1N@6^q)uN%z&%g7*x8LV@KNsVC$}98ZcskDKz%vb~phB>^7~&SY^6>=UdP6 zanLp5mX>`!#SklQmgTwfzWlg6R`Rl6w0S3X+OY%auKS#(#Hb)l1cOWxc;8Xa@Us0c zOUR$R{@`^orC0a$!5eeo1AeRF>OQNE+?+0Y>lDcp(yGn8tBbCe2L=^(Y+Ub*E@BinpXY1dj*2YSqe#j+c zs5?Jlfw?cZ>AC(e4G--h>4hp}Q<|VnRr#NA_ylCS!bSA$fp#kInngd7k!OVkPIFY$ zf83mFtnwT6Z;D+XPK|#!^gaTr`E{kT6YlWM?pj^|evXNw^kWwj_YHJ6M!nl;4Y3JK(&V zwmd$(d#@N38dMr^gA#1Rajv=AYfT3R^qgpxx~8U?u8E&p-IT7u%(Og>SHZ<-04jdw zbXpA(A_;E#;^rkjE2Nkz5VcguJKk zj&^X^5scJ;=NiB57FzsK6z6Jycg*Z893&1SwxL)J2iWc}@ePv)<@oC3oHkeo&gT^yw#x$I| zyrO9(!m9rRed=mxNKk|yohE_g)lOaPuA07nY6T_ltH50rq|-x6!Qm|a7^xL>>96M# z;VObNUr$Fzcx;}l)4$yilc6Ua=6|*^?{~IvDiIW6{vnLC2(?C}4PB5po1chjR6-6h z!F~dhxqcJRRb;S5l~h*i;^B zf|~h#L#;iWN`aNZPO{(RIZK8CdrEr}F@2tP)twJcTbi$mKm9d5FQWkJhNy_y}mlVyE&B!nnZpt6XULx%-a1qg=B!u!OO(xVROA@Vyv}V;&VUB z#DrmU-^#=oVRK)~#GcgVGi-ls5NK;LYICu}t#WA0ad4}0@Tl^Q?bkis+BhP~s+jC* zQ3+}RwY4OTbJdE-miTx|1{?vdfilGOKOZ?n2aS=AH$#hr751-yv6J;?zxaB2ezic< zN!0nNWAhrxRTk@Hnr-EqOw@K|D}1j{wv6=|9lhZl>7Hg|zmuu&Gd&`1LD=EPz__bh z2r!S7`}u)bpm{Y$?6D<;iGYv0^y}x6{1{PoE!v>q*xyb^?mzI}1%A70qV60+X6zWKc(-ES8SM1Vf;K4;H=rUa~3+2jUa zvx%jmvDne3{jYYdw+XIBmT^iaRK-RX`MG`Ru8&3=_pgpO&82YgF29@;Uzj`N zgvCsO=0&bAIKWYPezvVuwc;%s?(wl``Pit*r{Chk&f~F!1(sC*b z%Z(CtH!ktKwA{AQ+}i76pc^ZxJX8(1TY`sM1kC=KJnRi>$@DY?AOjz_tHPlS`?aZE z+DZH6DQbdA`@N}8p*b?Vx}FXBdLI7j>$LkpIdCbO6|y)Husuq0|MwtYK(+^ln<1%u zK$WkB#;H5cBTJk&0jXqv1}V?4!z-E%vJxrXcn>1&c)b zkA~X)q}rI$>nUR1z$W!nJx+4-mDElZpRcqmV~Df$HFCStbKdu8q+x`uIO8LsdO;R_ zb;kMXXF_pQ1H?;k;jNazQbGxck(r_Z#J#|-qRo{3)~xs)-( zJ);Cd>9t64u&ZI+%8FS!ooZKUXG*U0kPY^w5Vd^g zG6-9q7$dn~?lpvq&M+Y;p~%3oZK_C~5cf+QBl11$&XJ8m_kA?NU0)}45h;W|JS*JS6M!LHC2Anp|1`|t?uNJ=jI%)zltN0 zPr!T8!|fMc!6)BGs@=HYo-VQeTl5v=AP|#oI`WAmqL9$Ybu1-O&v>T119gepu;9I2 zVeh9E6akHB|CrGv}TpDYs1@atmQ=O?I|I7nf5H&*t;$4-vTfB63U?V5 zIFWs-1O%O44175*(<>1cw6>OtQbs-FZ2?m}M#vdZBC4@X2{Kyyi=q!tfFopW=%d4$ zj!%p0wlN=dpUZm%ZAP`xD5KYFIxLy|ub7v<^nD!HtTT)4)Y5g`A7#sKNu*5w81eju z^353EZD^fIopm%yQuuYUDNm3<`={m&p2YFDsGO~ty5eZ@C$g+Cg*9_p;Sy3~&s-2e zfoz;0^w}VoIPhq%xaw{sx|2U92O4PC&nb@(U3kTJJGSY~j!>9Fr|TFgEeYQjk+3Ap zQm4d@UNn$vJsA%zys9uLLY4{oO@zU67#P5eUp2Kf@Ir9!y`St2#uM$A>!&Kn5^??z zS%e1aE65%e8|tYWm9!ll;ff%{W_KzHfRIuW-x>)m(pU#ZrBvR>DbTyuRLDcpb<6|w zsk<0dul&zimu9Y_-b&?U`_fQR-R}(b{DpGu@N+|+Os{{=_9f7sZY3%l#QPN-hD(1OV zpMNT`$SIbGLl3g}yo8AhJi901xZ|Sl)>u2}*S#w$X2em7&(5vpANI=4^8I0{lx1&A z`9~4TJaZtqSdlX(E&T0V5+M9rRlV(Z7mhxgdga67?-l)FuU_Te!RQK{WTol7x&GUt0B&n>;Cc_W++yaT##Z{#zHwKH(K=1G?e_UT=&3n!oXO81;% zGGO?DZGQ2z!mn8dQ}WLmJCIyjaBRQh@T;ZH=l;j>lhT;zpuRPlGg;m%0v(LQ3J)NoAHSxixtwBGQL!fM zr>*4ODd?&(LF5Xj#CAXT=s=xR*?+|h&+uynp`{aR2mX9YCy;bGU8cvg4Iv&`5h_-Y z*qQxgTO^qs6yeHrPlgc_BfD8c_y|(@>J2)GZk9mDkT%a0le{v&Lrrt-M;PtU3B8Pr zPj9+2y=KU7CKZA-V(l+};jigz6t}1~cp=}H!)dcXY{N{jz$43tz{LU-pTi_QjLhJ& z_XLySEz}%7J;H!ExE^B+cM&!TCVON76^vEiq>1YMfYv{Tq&b3#dO_hM+b#aF% zx?{hrX{h^cC-c*2%#@|_RVZAReV8&IcCkL@N3rXYr%_B8GG4}*FmVp`1YoFLc)j>e zW?Db)EZ0FBuvL6?!8rJlrRC@pT}Mg(qb-)wtnrhG+gIR4sO{KxqWfKFSF!*RDy17~ zK6>wgaA>ZlG}&^sXx!Zbiwihw+#N8-^8V>hb^et~6!*5@xCa=&?_=fTi z7FsyE##g?sBQvUV`;uO|#OS5z!{u`m--A#;YE(53&a3Jd80(_`*PiL%bsm&-bBDqM zLgoPUo)7ySH*biT!bSf`Y3k<-|XOa|faX5r~W2onzKK&JR zMEA1C^W!3U4E__hfCC?6hhYNM43kQxZNk9GIF~+Qf$w-ts+<{d9M5|ki=7<;e_L2e zqKSjE518@>8zg~@JlkkRd^(nw;pxRSU)a14icfLq;g!PIkIg~#^7WS;o!?*H4(on; zn6jeR;pXM#d{D%N`uoGd$oxav@smif&p>uKNM!m9V#)v6FMxVRE6&I!9|RikFfy2H z7Z(xNF%}t4lpWn$iyVrlo#~g;e_R*ks(3NERD34D)3!gOfxJIFNfQ$Ea3!jiVz)1Ldx-v8DS?$)3L*|gh$vJrToR(HD>i=6eNY%$^>)|( zYcbJbE+rCy$ro+8;;>V_aTG(lBNIC9+M=V4Bcq z8{O0R$bKa{*5R$ym~F_IMx?!X{>^0Y#Rqz$un%eRB|KH4?-g_yA1yjr8cS8P#gPqj zQc`0@zHgXE;#PpY+NMu2s~mJ0o!ZY9N4Et%(~P{En#=}OjlGaB5}EWtWtefhyR>)v zL@YkBaaVnwoeSq@4iFLzXWtqX5ou5~m}1<+nHCg<54m3*{?dD&a4s`HQtD`O0kh>u zBKu2&5@n_J{rwJ^t2d5P2saNE9v&$nqNdeV`jr;fwZ-GOG;stpG5J&HCBpMn^ zF-YA513qZd(72@{n5n>%X)B*@iBRJ?kFT=@c-2tPPsW0yt&L|Zi-S#xMnOAV)I@y- zbfXy~x?fyx!PEk2%SqGdXvqeFy4%d*bS=+()YMJQBZrD0xctY0bH@UQ`_F!n|2caVG$?y#`%LFVro|!1WY3;o zVB-6U01=VDfk{2Yy~t!8VN-MLH(A-#PRmU_Myzc)(1~imgRq0cI*f+mTV?IDl^nI! zXBjC{pCtmuntu~>9lo2`+R{LP)$(q$g_N@_udvUv@UIJ-e!uqsXBl&1O{^b=`k#f6bHjL7EI=%AqU z+Kg4!#vhj!Nwski-ia(7xWSs>}Pem z{6VEIQA>gvv^`JWUzJWDZ$l#gQzbr>V}-Fk6K2|4XPto(B4lBPrFzwdo^6Dk)%KKx z+MK5!6vPlf%p_RnG(T}w!NFbDLT4m7QDFu|tI4W}S22lYtBqBEI=O${K!C}P2y9Mh z-sIB%NjLVDs?_j2h<&~z*+G~q&Rm?YQAJMOD>D44r~r=}3N$dpwpzteCnM5A6GtF6 zlPG&7x}7PoN;`;`P{-qoIgv*^d7;I!~UT z*xe*0^-PHzmvjhn>%*$T?&`}ThWJf?J-ow9dg&?m|14N|o2K!hqhbSXa7acdy zF0dVzw(dvcCVxx9T9@p$lYXd&SY#5&f z|sx6MF&TZd+p8Ts=BdC8_;ASEz-z8Nh7+EzY=; z*Ue$keY0QRM)@=JCFYxVo9_6bZq2WZljj^OX&*LdbWUc}VQspmMM9lip;U|P0kOnw zWsW8S16+@ntCgmc^(1Z1K+#fh+(|OAl~)3r?emJQZ(H(r(|J6p#`t0tCU1W1AAhVJ z6dTv5c`#Ry9OGfOt+iKn{PDV>A{!g?`im2o$#){IuT*h|L77mcI5+0^1Zi%wGl=WI zm8tjurN}gs;I`!U?>bAsCl~a*{_gDve<3|ps6?vu@+EuxfZ1zblowt>HiisT7>Ji< z@7%@$-k`!gpHyY$RiLH@CGG(wF1&!gWBS!CN}qHGfeU9FL=|^km3u*D-fDh-U+!Vy zXYp9SJbL}kpK4p4&OFsjvMd{)05j6~?}{G;hmJx=D}hLxe_}uMpVHq40|o=_f{lc~ zzjN|I3X6(wu>UFt!pXHiVM!Gk7{={@;{~jJsQs4Tn5HTx4Yee(cyMqlIZ#Qu{;4ct zXVC-~my?KyCNmC}G0#{|Sw2oJ-?j72J;BOV?#of{#Y+Uh1y^(Xoh~ERZ+gpE?`A@AE8kAp$Rq6^}M@{ULq=ZIp z59?`>rtz@wJaI)#r*78Sj_AZr;lxNY*bXaY58GEGYwMTI*V9JZOW}we?``(*NB5`%Cycv4Mugd0Hmm&W0b=bl`Mi7KH!eIcgY&PZ5<;zHEGlcAnjjKDUpK@bbiD*)CqVWU26dEpc~il<#9? zcb|^r-#t}plbSpJTs3%1p=oK-L5L-~W$0s~e!&(15%yf*m_6+jvQBMmPqC))_i-}tfg~IFsjddA1>%9xBleUiGEQlX`DG{HdSJ&&vn~n z&03lhmn~Sjr>W^s@>$skKkcw$>?mieVH2RhON8k zXL@tHO-Y!0KE{hSuI6$bBKWTit70mBDifskp2fuS?fDyBKq*``>T%5n zQfjhs+BRtK?Y`T%oR~JvUr$NTg0~lCcK+>oC&2S-nrlv;spb=0@+}~Z@-6*OR$Ie8 zx?k0k%JuD-)lBSq$mFzfihajcM2}uFhK2cw_c;VJ?4NdDXEx((m79;)Hl#~*aIf~q zd-qEcIzqV{vW4Ap?ZdmTQF=%ACYf4`F9vDfM165DNT{hw^ZKMa-_#CN(em>o$K3~` zdUTalptH_2+-ro(0%EhMNLtd_qcn<73``$kM#8$ew^c8Y=G__mKC>Og`xezt=8b)b8xob${^E1>kX21ONG&Qg}Gikt@yS|m=C;Qc>O>w$X;^|%6fl5oe$bkWF}09H5xZd);b;*E`>HhJ zQj2^cU)93J@tcrX)|iAoWpwtHOitZDOG^x#8OED;Zvt#h;}qEE*YR!s?6REG8B}z= ztjxeW?*qP%c_$COGU}KMa_1f<3%RqjHhta5{gMQp#%eV1oiPN}3W}$g$}W%5A$@<` zEOHu%8Urc`R3)1(lyY-c+6*{@{C;P>#@{DIJ06jpBM6-2iO#u~XI&W;`c$Frcs-Di z>lwjs3g>%J13k507;&bJvAq$t>%B1tauB*KPpKzFtx()XE}0O8B%e`tDxA(9pI(x& zb$MTcr}Hch8BGp98UL{o!&Yp5$ZyyzZAMSi%Z>G4(XG{pA1vvFiFxsNWt z3KKaWHRfm8cB0;coBf_J!3fMNdD;o787O*e(LiVEWkXQdvDa7tI1+M2Fgax7*3(1aRVKPd=LU0bNrlG>OeQ?Dc zCGi2hTDEBQ(r7=hmsEurSkNWs9OmB_J}WS<2@L6=C4oF8pnv@%n|3poO0*@%EGi3? zBRLqxC&ZB*gnhRsg9#GI!C)B&C9VP_;ho3?VZFXU7?^~TzX*D=V$?c!3JKsEJ`8LuAo{Hc8iDqv7 z@6HaGSR+cQDl*1QWj{zDJ6zS7>xLu8OZp&u)}9oby@9DMBJn(fc=#Fwd9-OP-1xIN zk8tg-kbG%AHakL-P{);!`ci3<9gwA8KZ@jNI)ma~_Gdg^6k>rHz8!c^S$P-7xarZL z6CS9Kp<>L)Rrh+8$|%iT5)Bmh7;IGu=j!bQKl<$9y3r)$xsSsRk8gkoESwJ0uKF&R z(F4*y*dHnI+ zko~BQTRD69hLhBTsNhj-_W~n&Dgx|sUF&?Q{o;)C7+625qe22^;p^pndDHT|-p^!I##D!fQc+5!Aeo2j!Wrp3 zl8Sc=;8>KJHS>c$<*e()4_D|IFf)Sf=fu^Q!a6>C^tr9PTm3z25^^V3Hq}6O^v2XA zB$~&8mn7V2Oe9YTO0n4L3SNtSozbDfZ7#Lk?wU#3d%-UVvD;|f=HFcq^v5>b%BVXV z$zVI}6uz<-(Q9OE;qQIFJs@yKn)F3M;;s31)b_8)IYU3}Ij77`{If^9Wb!_Ho8F7A zOZ1Oik<~m%IT(X*_f@nlVN&RH=JxPKO>DKjHSfRkTt|I?LMV^D+;;&@rj7>{{WBas zpI{0T~;+CI&r!w9)q4D8SGSdx(h=>|QusL^?A%3kvB6epcN155S%b%64iXsir zHiUg2{6$t0$}7LPWgPz`B$j__;kRG@FY%U;`4=p;i*YAFPaf;1Q;r&BwE}$E8$Q!_ z1eqDevL#o>K+o8#NYcWsw;b#@8KllEl9m&++2QnFG6W!lL<^P|)%cbL#T4#CnUT>S+JT_{1 z%O_9mmT=3=3clvJ6W{$rn0J6Ax-O?ywwGJ6&^3g8$t^@FrL?dN%$ z3#}e6GW7fbx~v+ea=zEPzw<*!PcK}j0`c9z@zdGpo8n9o-PCXz?|#Dog5v667j}sFU_3Z(6k*l@Jd9~Lv@%~x9lobF`B0ef{AH1BL$d|`I0!W&rNeftH1@)ns)Wz#A`GvE;;xF0pty5DZQMCXYwT4_9_NAr5`^Mxt=eFhyTGt zr;uk$j{c{0*`Yn;z*eRtlZ#VYob5jw!3HV7&+fU!USskjwZSn-CjMSW@!8j34Lugz z^mhT=QF3Uzye=kH+^cr_v4Q&CIK#{HALg+$XG{K96(D^*V=v1h73i%P@Rs|QL4M6I zE#R?dxLtj+za&*&8y9v9SNh%{&r%&b0&cFB<{WEHJeScYKH2KIPN)c_n%rDYlS`ih zCFnGOFZ^!TcJlrEJO9H0mbz0#o9QumT6>_QVs*esNQZz;wXl{a)T9LDshmS@Dbf+Z z6D)PWbeTP7&@6^a-S}SNqHe*8TMBee@{#rpazDUcl=v176zqwDjRJGS!T%_IDW$Bc zeRlHu_gU|A;Oz}#v1GhzSa3I}yFp#01F(>g7RK>9fV%RF?NkRN1+!UY0LTb2oGQ2w z-F^);*@(v(fUJK4ekio$Tla^H^xvo=hDAxi`Bb^COvT@y3b&&!Z4D${e)TJp<2`<3 z7I@$9yT0`dau4VY0(fEl1$JQOA4&CaBaekCs(|YQtK*F^{h){^pMvovh57yI-Q3&| zm92ZV8jQV>&R4)pvR}xN4&HHo@8FbSo$V+1qs3U=Sz?10`Mav`mTTRo#6W`46Ok@z zPNOxB@Bu9L-W_06htJrQRoJDmpV^rI0uxPB^sn{DD_nRpr;pCX#SP1k=s=H6|l)Laj3}KGJqpkd5K`)SP=6skgL~Xf?k0??4r-L+Q7;WW01o(SmR>v&dt{4B?OmU0C5sj^nAM zL90sutWJG!L zi{Zv&2noW@9WPd+y-GW^t!is<>n9)*_5dhz8983w{RnS3_1*BJuoUO653U7S&`yr3 z&t}wym)}!zN=r*?{W?WDkf(DMcb@hyJBbO8nq04to_?NTIkIw+BaAx6uy(F$JFPW; z_9*yuaC%+ui|T-JK5D8+FD~I_C^nT@??#+FO^qA;qNm2%V3rr-At6E@CVXm@Zf`}s z(o3b?8~;ST_kDZz+i^vA-}cKbj%`d={PJc*TEl)b;$4lz^hZo8S1+f%W*TmZ4@a&m zfz8gCfzR+ICb2z2?w?U0*Noo=MXb8E5jY9|$arRs#776g&naxnO=E}Jp1=KoR7ra> z5F?^N1D`s8i|zkF9nL&`Vh604r{082=KepJ)9uW-+m1Eo8rY;|Z^$uB0z<1!zA^|) zhklcOab6MtF>N>&}OW#3#E zSFFF-nNZXE<))ueaO#emPI~1ifhq zm{?bt7!OQrxGX9#^*-)eB;iTS}tLLGh-uO<>1b{o-fmol1{MOh$SqC4+;jEi3eA?qn>WA0~Idb#py3GLdeK2#g6Hv+8bn@$|vC43AVRIc$7aEnr z92^|n@sV1KZMj9jKLcCssf4N^{K*(KooGn@2pQeaAF&;nL9NEXi3npk^|;y%pq2Hv zz}bpMa1-ATn)Mkh^H)(DIB}SCez?-JWQO>?-%U1ty0WN z*tINo69%h!O*Bv_jJTrigw%h-;M*qg`iXj)BkH-o_Y6S}R$aK3ArpNI1AQ?J1`w_i zw@HCdp4b`&EMEZxu~I~XJ^6Av1(0OYGxCWWfw4v4Rv_ijS_HsiV{Bg&@Kj=i`QKz^{Z?^NY0HF*3kcK(NbfSMtsK6c5C2j56Kg71E*gk0? zjmwXn|8+uF^}U^Kl|#qd)Vhk#Ey&;&uFoy^gIY$~fa;a$>euHue)^IR_W_Li6nCcOB0()f)@-gv4QU!)56&v;%0( z(|z2QJlCMG*<)MERC^BTwRmObU7>w3VTQtp3os_y zs~g`|-%fPW7ja)C31ohvC3cLHt8&V8FVB83o6Z0jyKfksX8|{70jC1?Mx}B5B$%%P zgxd-}WV#Hp>CYu6Ckt`FoQkM#aSX22^ng6M&&)b-DQ56}Yw9i$?c;YxPwx=q(r<;b zGRP9GlNA%6lSoa^bD-hj)e&n$?op|#kA?Y-PxpEPv?Bld@-vE}R(CE!?e^e@kQ-Hi z(Y~4mR3d zVL6mJ#}U$fZQqe6D5LqWcJaG0yOPg*^*h&b2@ay}nj>QA%?GU?H8mnf zC7kRU7>p7WIL;sjKxPoKu~SYugCSdjkxA=Dl*fOltH}y^IpKRoeWx19lB3pl7L)z?!F|_k4MFm zY$J!Qv%$(wy6(|&mFc&T6%^q(dFw^%sXF!ho7x3fr(07)qA*DnuIFPJIWB}ryt40` zjVf;d2vIYfKl*65eUrsIn1DX-CrQF9Rf;ibDZjW!Y~WysHBbxQ2}aR}bv@$7h$N}z zS>F+D%8Fw9rKKL{KiIZbJgFfUU;u{QBISsTxEK;{o|xZ1kA6N?rZ6ru7_3@91CX`x2lhgEoP_v@c>+Syap4%M_>p(CkV5?7GgCAF+p}4Z=75w! z;xyeM!yS?KQ}}-T`1^(3NSIZltS6utc&SkT6MlJSR$ z&D+V&D9+HPvq$ku5@HO4u;(*s5kTP4qJYe=Al9BYF>G~`#(-zK7ZfA9NVN@x@*q)! z>%)Sa3c=N9^h6_{%%@ao+l8O{9?YzHKgOG%=8oonr0pyH<0d*L_`M@2_c|_&P;7hs|IFg@jVg zA5QpE!u?CSMa%{)L+-_p+oxOwJq!Ed%-h&y(%TjXPV46EYTIF}cBf1w5D_3?&IIgf4eC7KG7hZzui<=aQG~ z@XuewB2FA!A4z&7p+tA_IQP4mN(j@SoTriZ;m-4t1le}M1Wt6~tVeope|et|y}WW# zX-X0vp}KFn$@5X0)rOHWSlOcxxUXTy|L9hl~M9is)&3vlU z@MnnDeh6XL!*^evJ7?gZ`3v`95*X(app9cII+`1N#p+t(f-diA^tSO46k^(V1kf|T z2n_WZ53d)Q4f%iyOZnXw^<>I$v$X&KHfHB?&e<~huQrw zUr65U)ph#?|5!izwddi~B@y-D(2HJlcaJ3`*B}Eu#kOB!^=KnMcJ|2WOgN9zFz(5p zLmDgh@I~oYAnDlb*+u^0R*JuqmGIt_I_Q5h3^){EmK>q3`&2J$@@)2!3}QQv4;O_N z(Rt4X6*iM7)m7wI^wn{_*}Hy!hiFFji=dqA73P87o*mAg*DehZ4bkqz+A^HB3hx*x zdP*Z8GERfpYtDqJqFz zRq8ih{@9N6x0}pgD#va73K9GN?)!go85XsIkF)B;9VbSO=tDMu(E5GT*x!emQ_KCL z)+>R${-x(PEJBul2n?XwkLW#0`s>R7j#8-fB`j`)YtZ5Y{+|#B_O0{yaoVG^X2~0T6fI=K3c6JrQYV;?k~Or^Z8)> zw1AV-t&l5Gk$2ejd*+V{JGQ)Ru-u=TI`yUb52x{|Bu(x1)H*;i6mWXI!t48E*1|1# zf|ITB=V8=oA0tZRpB3L;NeWF&(4!{*M=pK>@I@0c|6b9d`Bz8=&g%Y80FD2-sttfz zqOHfdHd>N-z4KQuqc4_R`IQx81fA<_vr`-@ec!$ecG*d?w)(**=IE5TNg1tIbyZrs z#Y6Uyz2m%dtNhRvqj1pY!7ktIBE?kbT5XrWKVm|20c}D8Z7p_otxuLkxr=9IxF8a%9|8gp&Nt7Be z9)H8vr8XpUucJ6Re&yO6t=9J-0F*+xF(Pcv4UIt*O;3m(W@8)Oyxo(EJ7{17#Iv7i z&bnpzZDicV8jz1`0kWiS(1e_S+L!YEcc*|9G@W(+|LP-`(ANj^*UD`vF78MC{N*`^ zT8o~@bDz4AI$AiNU-oFsHG$VHnz1nLs)6iw9IbuhIBA6Hf5Pu+p%QGKfLk>{1UU}3 zr}gt_Xzo`^Z2gV0(c;VNjeuE8rkKx3oI0@IJ3hX09iEC6dZcsTq=N60Tkk_!vFZ8a zx5$e)&9k@lu`CRHlL=(qFWF%~^iIlqQbqIpBW=m2$s6#e#f<~5t2jtgN##fFtEI}B zfDo};dhEqxRY>Qx>uN@~D}W8Gz1?q?+oSk6yV=;bzLqM;?sY?1H~ekm@&pPGUgbNb zibWZnKke>MD|h~<6aowy7oXiaFF{dI()!~Jy9{J3^N3JYY`Qgq6UM)Tx@&~9+qx_x|p!&A$OQ*Z#*k>8>JN;2y5?q@hC)SGLIq3`-SJd~X846%N+ z93dPFcEW6Q`>rYNhK!%EH<|Y>xmz_{zYgD4j0<$xCBwvJWXsXD8~kyC1GQ9M^LH%u<+*kg4M|;Nti2{!Q1%thGfIP{lTS}l$bNZ~0*~KFt)rPcuF8p&N zyu=RTtf00k5MT=)dshedA^GC@D>K3MWhaH zUVB6CgM&8=nz+JpR;ZBvA`F_VXrR;OuNQ(Xy|#G@s;|dALRd;%yCt*Y38k9LMWolF z{OBKIw7oRXmxsi#%D)~-(Xu`3zPd8bw!G@{YB`3Zs)mO}@xc*$E%dOBd@(!@(#Mie z&X5;cID!2x>Ho1F%mHNBNI}us<2)sZ7?h4arpsG%K!J8u05}(;>@V@goO>Z=>bDcz zc}Ed>#K2~vqQbQp$cq_${1|X(ZId5QNsi2U=%y1HTWmY;VMQTZ0GWBnZ#tJg=Y`r0 zOa*v4=1hpf6Uj7^xLdn>%;>pCVzMFmW}Bklvd63h!4N8@udlfCTjz6bJNr7SSmC z7HmhLq$2AD#fhr`4m(UC?kxYLoe(qEB9f5GA1rM4JCXfy#72NX zHOFg$`L6eH8O4{uXB~m8PNUz~XP>9OUjxY`Hf*+2L759++e@sm40yw?DLcq{Tt2UWy){B7{cmK(I3|(tH6z>Roumc*QkLr8J zkD)0}zA>GSYRxWS3(gz@R6lPS{>Sn48{H&FOy2*8xwnq0YWw>}m5@@{G)RLqNJ%$r zIs^m-Dd~{zR*~2=2uL>qBGS@=(jXz-Qqmb*UG{twGf0DPVSsu9WzvF!L{YPJ8qaP3iaGPS-dzz6Aj z8~NT<4zp4Hy=xPpCyXEa`47q8Q0V^8uCcE%$LSERKh-p>=hB+I#rmUCPV;xZrDZz^dFr@zoQY=cGqA;WIX!LqDLY>wd%ZVD)R;6Q z{k2ICfUyJd!=D}de+yN@G`-O-_X@-s>)O@6 zI|2DJX6x(g;lihW}jOE5&vp3mdFC@rzC-{Nx~RA5A94;NgDd(;C+Jvkox z9}Pu&O80A957tLmKNPVGE+?Aw?DvaP5pnRC_ov%-0CLpu&WD~vE)#B(?RJl^{-`>( zry$c}3fTFHb(YM`%$AQAqxxB#QyrQymO~iR(ZhsMucwl~qM*}c;p5VhU~CHO4(J$?P62m6e#$$l-O>P%+9aoO(^!>Ft|_cy3mF+1?FQ(egD9X3 zn|npv_6@s1AIbRb^p;N>4o7v9ytZ2abF{K-A%2!r3uyHQ_6Wh_x!1iQE+kcJc^sr4 z^nz$5QX+TE>$&CU>rb@aRZNFC?{w7|;T8-#{?@=J%P$VnpL#lD=qlD7tY7Sg8A2pH zKV8Aw0BLN&+!hR$zL;G)<2PY%F#)5;!dd{b8ETa;0R7qnI5C?`pjt@To~gI!hVn>$ zpL|85GVPvBmLk-Yn6(FJt@6d$PLsXg=}<=UkI&vIAS;0BQKRJmEks17KruhvKG(;F z$n)eapUfT>k4u`+l5!C`Mw%b!G?(G4V(Rw~)hx`35}Ry1VrFLM67@RZ-1t}x2$}iS zK_Zxv_hy-#Y^rj)$i(^SSQ8&0>JtP3M_(kb3lc7RU!hT2Zh*gy6%iCAC*LKzc?QC^ zw!8<2i<>UJQX;GSgf_-4+oo%sRcK>M&*mGzco_0n4;S=^NfB`v*g)(a*+Aw6w(#aN zO0QKKT{%swmOK2FOW>l`)`Cm$`lIgsf*Ukm12V*BKdwMl)&sy9r?1U6HHjAIG&D>f zF)DMoL=%OmX`97sZI*$|iYes`BOm~%J`PAn^c1PiePuDS#REWQVx1!r{%NtSgOW#ht1?!Ua_q_<>wYQHEtmvg>8rQ! zJFKkq<* zt_FI)_|>`cixAp$ul4swsTRG-W>vF(-upjeADY|ti}vtwS`FUY2iZt%MGbr1ZVi#5 zN7_8D>zcY9H4aqHY)5C`3@-=YI#Suyt>qng=-PhL8s*gm_G(bi|9tHUqMI1B(|+Hr zZ+-4AqvoFVWQ?a=Aw6wvij3}kJWzbmTcE_o_bWhe)2mi(@wZ#J0R1JtYka0Q+3zI zq-1+8(@wf%J37-2xn%o$XJllN(&XsDy4zuQ8E-?~jDOw8=en8Cd!n?u6CZN=bv>#y zur~NG)*m0NKkcTA86kMLJ->F|Tb=0zva~){tPNfsfAh{}Y0Q+8m{UepzpF6;GeDrG z8ld20=arWaV!nbBb0Aemm1s;cjdV~hw#|ClX7#M!R3jw0u%{RVv&KN*j4?LRzDeRn z03un?InPyN3@BuJ#H78BHm%MsKqS-n<$P-~h=2k~S?OhwJS$yX;HvBwxrhb>xn;!Q z;2@ju0)nm_V7u8Q!p|B=@`(+72KrEqsRZU&>n51ceWQ?|lyP!GzE5W-SJ= z^a-3b0v*SenB(4B$C%*@+l_vaedVcJU%!gM8pxZtBxRa#Zy{LV-=Vk;ItfU^@sUc* zr-p)OIfN@HSC;m+Uf~Leg0gx1)QVpCN%+|!cofT}PD@*`_iqm261udR}P`9a3 zK}nfbjGOCo>2*tQbvJ$6KCL`OTDSzcGb(xdfbULx-6jZv;)IWW+$--C{0+oNqxf1P zd!2MVQ%)u7S0I{8=VoxHgrz4e=Z|F} zY;u9`p-lrhau$(g53VJ7ooM}VzK_t%#!~Q{`8lzfa9yMH56u}N$+Vp!I@&sqWvpP^ z*L4vMLIEL$D`BBqLuNqX+%{@3c}%T|t*2oCY= zwi-McD@8TjtbIX{ME5ZoIUw)_#SNNJgp?(9fj}cwjMOjKJFDC7AKk%9&VH1C-S@x_*lCEB2aZ!#JoB(>-o_krFViDG(4Y}(#V4>umsv%YQ(!g~-dPXf4+ zx`=BeG{eP!_Gg1svMu2T_MkYR!HBUN2`t%0Toiy_{{ooj_UPBy8)4@5vaH75L& zrr916r8S3(`GBRpic-?`Y^&PB_I2kopH)WN78l{cL-4)1IH%`}R9B@j4FXzr1t!)PTO!0y6Ap99|UCinbibC_lql(VRG?ns4K245`#S#Xbs`GPnK z5ckS7=*I%(@2X1{W2p;KpSDf2Gy1PpE3MpH@8ZP>q#k?e-3%-lG%#?cqN%ucUAy&Wt=p-^9s)+oy)K914DMIBI8yjoj3pUToTuUC z0!=uU8C+J!p*2F>Kh)E9UKA1IwT_}iqJWCBO87+2D3IG$z)kGW*+W{o&NG%5V9>b} zTUxEH>o?1l@>CxAW1_`ta%4*t)wD~YVlQjfqTislPE1&IVBO|rD3GUWEnBJ%@j+zM zTuM3oa!kG$W_`Ae_3gPdcpkAtfY_)lF=2`NQy3}%EJ@{~OQ)E^yZy741=54BGHKw- zv-wFB(w{j-GmPnV8-fw`e2y@pTE68dmiolmpM@C2PSz~vBE2yuHF(}l@?^Ut4~+CG z!B|HV2TXE03Y{NqENbK1P8>}%g%8FTk3H0vuThz`15ju{x&TdTb3c=D#EJ2V1XN7? zm1vp+n6*@p9XYD=`^9QCmt>$x1oAq$LMvW&=&P~YE%l@TYF%D_6Lb<;sKUf_s>;W zGK^Y+CKkCG;f63AwwDq~y5^smB~cMVa6)Ha5~av6`R5-qjf-)R0^%=7(Av{ivz0n* zfS4aMTF;{IwXLuAVaz_xi&&P{D7CU@CT-|^tvwJgT%p_nyFrxYduix1bSV_<`5&aXW7>4F`= zbfIgsEb$XIlJl_0e1+pA0%00{PUSBe5R>xv4#*zMqzn!D4<-Emd z6S0;jY8|EFjvP*6=yV{2B?>@CBuq%)O;-Z$7eFZ@4Qx1Bt> zH{Mirh*V1b#~t^-IYalqSCK84EPer%*Gl%b@BcmjH{x&o`=2rz^-2x+qBr29!sTMz zNo=F-&M#M90X(2gGqDsZsjs1dZ`TtB^vnMXG04UkuDxkkC_&ma4F(Q}KB!>$;K1WHV2^1ze$ndn_!!zs+A8 zJ?eFkFa^c=FL>-2XLl z!*!v@s1r!bl`KSeK5+X6seUWR3ex1Y9DD2rIkVFfERrGP@co9S$ph1pBidar$RPcAR%1Z$&b)AQ4EOUQn2qCIh9Oh&Km_xrQJmFV0&^rX#-+m&@EYI-V&$h>U7V@QU*A@||u_o(aoYNiNA_vnh=l)vgZB6%7^+?zK^4 zd@c@gQ}~Jz7yJ&r4O5wsI8qdvy(mqa?4u2z?kjBYRNZ z9xC_59gs}D`W(x0H)Jx9op+w|^3f8Fc1`v}SMxsx1#pWZjI%n`@;`lI^-S9@qA4UL zZRToB;JJDy@#-h7@3DGUONdQFN|mSz`Cc2SeA@Xv*c0wSD|EP?@7#MZ@ySh`cpBJ* z)ee3Af@Z30JR$_`9az#Lnrs$-)cfh5!7m&Iv2C5+eW58RwBQx7m>eBVE<(pQ#-f2! z!NiNNNc>lu(F$!f3q;nbI;g}wvq;I(I(qwaPyGeVZp-J8AMKL!xJ#yOJ&E%Eplo$- zq1c;Sx3v6Wq!u6{BSU8St61O5Pc9AwStd5IAKo{6)2K%f!rGoxbd}@nWaHfwVbi7T4?_EZ{2ycuzk8Ji3hm{O)ju#(s)d2vi)&kRsa)DV-yQ)|vJvpO*xx?5jpmiuM4FJRuRk637Gwa0u{nMC>khOl_RE5gP_* zEx$`JD0c z!98&_%k5{^*0eyq6_X+ulXz@2Nz1Aj)F}E*vro^4=zcMep)BpsQF(Hpc{TCdU6gvn zQ$5=LoB)KVLNt zW29ljOD(N=?!cfOJ-3bGjzx6uo`>ne_Tg+bL%!gnA40_K-gp3{6{2u=$&7hid>{a~u0gcPc zx^p2*spI?g_g#InItsMY^Ue>u`_2wa7L8MWaU5oU{>ULnWj2g~tNe5G_g8%B-yFJP zQUW|FC|c-1_Wso62;>$L3%RbxRoGTdF!X@1XC6(>3*c@j!xp#-JO&}V&A=Q%0xIu! z4gkUm1TY%}g$hB?f)Q{oU_sfxUh?V_pzO&hu*UQ~JBr9I$;&$uAe6s#RBQ zQ6r6>4eM)9V_rV{+EwkuQIOsX$3g1Oet;fHDIS3(j*|*&qFaY?`Y_b_7f2SRiqkhs ziNcKC?Ch<&seT8;Bt0A`2bW>SxpZ#6`%$O(p2%SVV4eti7#^mlO_t02!Vg<3d2a=k z&zLyiF0x?e@-(L7+jv?eBD3j_=L+Lwt}zun7M`No0Ki9AVACUqS@H!cvD;T)CfB zkRfTeMvAr}4wcXbn6-9)gh8uO_Nu&^N(%420wHn34YfphE|U=Rs0Rv99_#E+C16P}BS%romtlJ;l$5z=6@g%suvx~} zv^u><`=)^A_iGQu6?F1M^z7p7HxAHfOO6|dv1eEGe*)i|Bs4Y&u$0NGHAe;a6S{^xF*ywgE47|13IINtC;dHsX?r^uN%}oUYxD(P2dlVg`pXPo z!CXWKusxq^c~Hpsu}iv;12S{abBXp-bSKa(5{| ziOifo??IzI#B~gQiJ~!1r2Xs_;88eQh4ej_;*BB&mpgk!rPLg8zzVxdX^FFbu>PUs zRB5G@kCO`p7!&Pn>H#>*m7b$pZKv*|Ei}VkJ&hN5L(0zSDB36z-yiy{@Fj%i`pfSp zYxx=3Fq(J2fb_CYpRNssu(>zi%NT$)4=F(OH{C1~kq96Tw_P6pQ1oE&TfwQF#LDbTJK&LuTv}`& zX=Fnh?1K^gaEQApxkRi?{nQ1ZcLAWY$x2lj=gpPPkq=MKBQu2nVCb|YQSZ?7GIJ8} zA{%ke6k)l!-zg!|D9sj~G!lTE zGd3h@Mj@19){iy?-5L;P3Y58@`KVG6fs(X!lt z@uOqsU<**0=_x~S@2Lywy(FHMjyt%ltuW(nAR1#i(DzVlo6?qy77O>W zGPEg{kpqtvTSg{r`Rl~_CL-tW6HzjhHhp)Fw%tt@T4pW<*u%&gL}io{$M-(8>vX5S zO9rb4j4XHSL_GA&sH7}w_Ph9z1+Q47TyFGHkQz02TQ=;@h3NZsbe6ZmXv-E8d;xd0 zpFR_llnLnRpVk0u4z^0Ir|k_P?B8x<$)X*Dq#k${bz1T>jAk|rU7}3 zLaPeAD78D?HWKEuPgH`@39W2F2EIj~0q{->VY8|;Ykbw7iKu0Ozd;OZ`#N|ir13b^ z2y&>3uW6ZoeEO;TT?kEqe9J!dON7CfVU&280+@ObaC#7S>l7e46oepo%9FKDTmo{6 zbQElC6U#feh{~kZbl|-L;E^{WK!a(5MZh@C34{u)?19ASwO5qu3PSJDUOl%}5%+PXJ!z17=&20ZIvx6I4>fc&R1Qhkh>iQ2Ie! zi7!4ZM@D`4FX}O4UZT(Wpi>`N;G!%u(fqJUe>UTM} zM6}im*o?x?ND-HRDB)G2GrXJXknuBajQWcb-DO)nWd%;rZ`0De<#zu{_j)5YjRtn!KpBAaqJDj1(_?Un&GkFk*l#Re>Y72# zs^Rgb@BV~8wwN}VtXX2Y-Wz95Zm1d3Nn@VIN z@%mBdDKeLwC=4AZo<`?s@5e_+4zGaOM&+uV@Nf?+{JUY%WG8QkCJmC+^B3VoR*#$g zt%FMTR=&p2pMGheYl^6qLD0&h5|-lW)g@D4SB8|n$6z5hUVkjFcC(N7ebC9#SKO6j zDgV%e2ide@X3-~Fk7y$;=;uX^J zezZoAws8u^9TdWSYee_l++rmUf6ww;UM)2)s*p~Ep{a>ANVs8xk@YlXQ{bWvcTGBa zp|hYc9Ga6KYpOk4^Xk@CFQWmQqA{$j*6si{%dh@axfcV$eti^+kyXn-R0xnSx_nsM zscl*BTj(NQh5*sB(0B5Yv1N(DsTY_ z%A^C;OK#+&UiWpmfLs_n0>M_+O=gAY2{bmsYGS%cttZd1&zT>3p@Vo0GV$jT3`+ab zYH0uv;945EJbIH+5ts-l`xK|ySC)#WX!dvT^Q)3Uub+xQirbST%eQatC%`Hz=?16+ z0=|km+Ebzg7_N&1A}F`kS4c_G<+>4Ko-oyub}HXG(1}Y`3sD-?s|z;b)eG+1XuIF0 z89_=4Kh7*}?R$iYmFsDkm<|v$n22jDQeQ}4ggo{nDM9Bi?!w#5Rq?9fuOcS<_ujC` zMSH{Dm0EgWAJLU;fv0I~Or;Yg|4KF~FgqQJgXZ zD`&Fs+1^r+iqZ|^~QF%jn`t&YsC@|vt*3~lo_qV(j7t-65!Gtor zo~9RIVaZ1uudk?nx_AC+G3c#N2GO528f4zYUyJ3QUc-!_EC%s9@1ST%#9Ihdgr*!I zNP7=uwq?JA2u`p(fJDg0EKqKarPcr9W2_8Ce?#y>wA60@;{nX$a z8}c4s*cHn;ub=iBzAs-%tUqJ_k)<>{%B^E!bNN_FKg}+-3v)uD`usBgAc^iTz@Hv} z@Xj9;p5`vFbNmIVOFaOz#~-NOk^v0jU;Mzp^=9k(m+T~H=>GZ)o@*-l`diIGPR<{o zzxUtXiOHZrV8Mf@W^P{QE6{mkko41~D}m4Hj*GT~88hUYXl>ObfXk=B_1B*XD1S<6 z;;onxfsShU0O7d@l6rW0Jg|2~lK1ogBcPrKyhVz1n1$dW=KyNI@PHLqe-3j$Jvaz6 z(5j)}0d%X-gT5IssUC9Ze<97GIv#Y)tv%q+|Nr=hP}%~Zipp|W%?b``^gYXy$pXnl z^I)b|<|j40 z<-J%2l~5B4fY;yvfA>`a(b((Rp8ya|wd>mjIL%9;gABPX_v<(D#22USfJV@@8#KPSd3s;0c+O}VtbPJK_E@YJ_>SNU!?cQZzMvls4W!?CvdJR5H&h>F9#%6e^h zp0r;am#gSalqpOE72C}A^to2mu}*O?>yE@zM8|#H>~`zfgjp!nm#4icr`Mn8Q<%Uj zwn^wY>MH6b(ET3&aiz)xtET)gw){kY;+EZ6ds6ROcY*VB3+D&rHc7cQZcj$FRXc4o zwYEqozcsbNG_mg2JaakT1}YyRGq44Lrs)aH!iqkj59zSI2akcb9zd7w6!R{fG{FH? zwo^fYL{AG;AFP8&r#oTy>ny{c<;tt|icAA^$|!{Xtvo+*F0+53CbUCG>@BAiio*vH zxkY7c8d_rmhuEIdW*^^hL&~5@f&FHq_*@(2CqbOMW(Jx?d}d?|H`UADuufTb^HmV) zPmC()JbzN9qEXer`Wypvq}p7tVSnPLsIgq){uKOxgh6f@#+RvSGj2T@`^&lEeq46W zQ-&pl-qZV9u7Q=}+wXF1P@l-6j3ltHbbCD+Aw2(9U!F02#I?QeaS41LrvPWVE99~Y zOY_b+O~7RN5fn>QJ*=T5BMenQ-MCP!g;9(Q5e5+K=@GzEe*;jlpW+PYrAEACRqM#s zR3tPJdx69T0(r7D&lr>wVc!}-mPIK9vO&-W8*r6!5#A@%1muGUflF@pk_qTB$QAmE z=_ydK8pHe6rO56#o$u{V4nu+=#Ynq`%GUyR# z0qU}pvuy=bEf5yf0`_J28xq3yU>`9Lz_TbbNJ7cBe*glHekkqy_`LWV(yl_r{Vc5y zkBz{l2n$E-_?ev{t))e(qpSdsv>Bf*ltNHxS>v!T_*x90!ZAEP95Zz8c+d)ZXA+fM z0GKG~4&OL+)o-XDkboZ$Addga7r$VJ5Wjg4m()kG4*DAk2$|`iyuReS1>>m z_^LuAD}xVeK`hruh?*~mQksV}!#1&>MxU}8q&UiYJK#)T)om1$8Gy}6%?lM3l{7bh zpobK(<56z__+{G>__4KcQU!9hYOX*#KD-SjvL&UJ0C10rMp~S{aZb);z^enrXDe67 zXD2^8Le1Q2I|va4+F~5x?HUt-lb^9zvIW=zb1%p3&JB)5!6Y-81vJpL&bui>Ta`n` zpA$e(R^pHDMAO^KAo>fe4?o!Sv^Ea*s5@wgjm?+K3Ynz}trc?%wf(I@GLlie)?*AX zKUx&`U=;%PCm8A~Iq<&K>HS4_1CqZmb&7TW{c`|v1&Cjr^b0bcfDIEiz_cw~L>$!|Nj_&b52S;FKL2?&IVkJ{E_hjQfq3(4?#bJ@zk#gy zXJr1x6QCnKP7;&_ZEWRMfkJl`G}E>~WTi{;KP zfY`D>t%=5Ad+qt;KB)X7X_4k%Y6}AuWzMQhI7jIhq_WEcoUP_@X#r#4#0tlk_~k5u zbP1YwB1}3^=zXPUk@mw#ScelFUq7=joJ>AjnVcpwp@4-96XKdV_*D&9Jaf&Tff~3n z4Iit%a1aY1JR4GFM(2QI(Y`4gDKTss=bsj)4Z5t^uG=uCSY!Oq|aNDPoJ6{!9KN18I_(#~HYxhM++-k*dV%Na&1~6&|`+!yZmrBSI4dX!* z=?qMqm`82q(fVXuXrToLxB)cpb=A0P$ZVcy;aZGneC6DEc~mtE8dS<_h?C1|8wCe(ceoZ%M}QihgboEXWIDT5-0wSgPxtJXAQeC}X&oj*%1IHF zk<|sbuHxG@i%v@jUET2;17RjxS&;w*_dE~hiHMCX;DhxagvnR|ZxzGc*BFHmreA&M z=ISFB+&MlmZCHRGb-9_g?)(Arp+o_I+)WM1Hb(}|m)zNgw@iX|ztFNkxmzun z4;5mU=#i<>*I20>_hFLK4P-v5#A$PZ$j#&pnd@NBt8Eg&yTE(9RZH5x5RhTDzE-;3 zxEa$pQj=q`PTSXOBnq^d%CQX70FfTFH{r(e`um_w=iuPm2HcZpl4$ZPYV&bC4-xTK z4wLfUbha2P(JuVXp^oOagIN6?!mx{K#eipGxvvo^g$VwlP4qKCC!%NDhdM|ClDR~HH#FeU;TkA3rThcCf z+QUilr%zp50zI_GcOi<$uYj**8{1M+5^%TOm zU8?{Tjf=Z9eT3IMcaC`a3#jWA{8%YZ^Jsv4_gjlS0dAMrW|+VcwF^DiCP}@#-uc4S z=HUxI_(lN0ks))y9CG3B(MSVViL1xLNXM}iYo)!GQ6;qUUWx^r#gL^Sk*xV*-Xl5l zu9%NU@iZvBU0_nVw5Qd_lp3G+Fj$t`=T@Zw(%6j5wYS?rC1)`zyaA7?Vc3uNxku9m z`Y*clm}F-ZFyh{Vli3>khd!r`hJ@*f##KOQz0?Bs{l*pC2-z?@AO0DlVPuY08W+rD$_WX z>=B>}w^8FWoP#pUT@-W5{JRYZfElxKZs~@yidszjcyy+5gSWC8?v~OSYGM>4^iU!J zIpVtkT7OKoLmbBMD1=D2EJpLsxyBHls#OT&_qJk$VU-V3)t47I?NPM#Ue*eslk3hz zzH4{%5@q9-q(H1ft87xT3rJT`(okTHI$_7`9aE{Sai_yq3p}xxWY`{SjmPQ zCiMlY5R0RM^C;o@f5h_&Ve}ru|jWb3=5Pf!X0QH@B=jKLZLT0 zdt!1X(sC|=;zbT+LtqC$JG0S1@-r1T&h1e4f>q*ejF`oFN(7ub+UM!hP-4v<9BCL{ z60LP>Hbsv@j>>sx0J@yUfy4IaW3X)3XFu{C(R$K$Br{xwv+v9}2b;Qa<59B$TvC&o zQ1hkqLqj|jI08>6=d-fuubIb|%2tJ$v>#*p;Vm`d+5GU9@pe&~hWAl2G;2JxsVk|@ zsv=`xi26Ec@8t4w%O=$aGi5crLm^}(c!TuqGW>}F%6IW_EAY^G)1xK`J`UH_Ko+bL z^vBd@3tTJ#33LMK+JP}3>kZU#VE$%_$Oesq+DmVHV)os$dLb4VB`8?55Ns0hC+fTZkJN6WZ9pWuceN) zWB+n`>1a1D#-@0)C{n%^>s23#shL#wMfqAd|Lf;~v;A7T^Myo?Wcl?ezScfxf5-?D z=?+4&Ne#u-Y#!sa+sY-bEkX|^u~+U}wN2eeB1@Ry5vzog->Eb3)I|FBhv9`$^Akg3 zn8r7!-zd3nyo-)4(7ZnMRSwu-G35*9M!GBtdG8y z*mFjsxRAhLuKPb^5?)hS^kCbd4aDewjPp(T5l6PL;5Kg%w3w}bcNYFubD$5e7Q z{R)CH_o$nQ5rt)!3Sc^u`dXViL$Cqj@6X4eQAGOp*dV22_Pzgd0RNx=16*<>PbKC1 z`$vpG>z88G?u=L&h+BCBw1?ba>C?PR@xi*|Gr))9LwEneAbwF3XTPzGpIA#uV+8;O z{WFm$kNW@=VR|H0C;7LcauGU->t7K63vlE{tfAbK5q%@XPBH=OAMhk1^SQZ6p>^AE zY0vh1UA8}e&4%fk^W_#kR=bnBr~mgC(4CZlzg$Gy{H>I>ibSdI|V)d;1oZ-ykoDczpq?>pSM0g ziUxWcqC8jMb+(P0apjXpWEAv2xuTBi6uelu{rCQQeZD}~FN5FNTC8jF+E@B*ih`HJ zeVxGkVTQ`}2$)IM3*R!(?!OM6Ub9A?%>R=){U(3Lb+$iVkf&us_!?l--?CVC%=^=J z=1m?z+qU+uEwFquTgSp^ALIX6U=@?y+;0m!aJ7aYHd*bn`^cB+|GZS?K1?2a9jeVI zUG(jX@1VtX*B-dSwcb&&9fC1HsbF_4ZG7`5w?2;SI`G8wPYaqQa3ccXjr31CMJ;fZ z#&-SRe54unZv}??KHPY! z+Z0AgM&9?r!{M`Us8NEr;Jx#bz&?F!H@oVu^WFJD!&8y^2di@qyC$E^w(5riHp;~W z={7oC#O<$o+lSdX?qEgva&k{!jUpVn7Ek(p?W!f`8;er=Q~5tV=Mg^qA>MA0AAD&lCxpU zL0rEQF+cx4N_J%Ghqm|8kh;;YPKCoNlkg-+d?QX5H`g~7G&a@0>973Kv3OvSA$aFB z5(iQ^F<5l?Onci1WGT(~;_8*W5Ea7YB@r~&AbDVH+}Ccp7`uDg8cer%$>GDtvi$_{ zJJ0W{6_$6jl<*8I*at{_#aER-rw2@Y#|T3w{w>b#*2^Qk z=&Se_R{G%0TRcsUBNB>0>AKJR`_e7M;!#*z`93AvVj`&*bD2zE(yweZ&#PO~T%)p5 zSCMdE`z#66<9tqu|4l+j*_mqTe9Q5{j^7221-aba;NyvhvTdjHYTk07bKph>+1+BV zIzIhqTRnc>jAfa+Ip-ub2{ye$m&-Vw3P5BJB4wOgnij3(>_mmzemQc^tJIpj1!q!j zz)Q-~x8lfVU#az?C8#$wv&m_!R;e}A;HjnTm_B@Zv7e%!@afx*G=uDbNr!Rn1n#G7 z9K`Lx(X}#jFfOUVUinkF0=Y?(?(HbJOqcBU3OFZ;EybHYllX^jwOT*+ax0l>BB!8mC!aVl=mj(y zrI$NAeEai+vvSeK&TGaXbsc?$4r7DyR)ws{L?huTm=ik9w$*opLYcwN3vs^Dt)P|X z8(yxX>BIIC3`mw8?iFWQB{h_X`4MZ#gN=h;BcryaB*}-QuFWUrWX?Y=E&AGc*B(s@ zsnqhY=4GB{j*TbQ1xJhRUp5cOqcEBT^aCMEAM%<=kuf^UW_^+U>XW9ejv}g63$dY~ zN~)OhCrw4RO6?V&Up_nM_B%Dpe!D>n^X;Zu2z5L^;x{qA;(WaJ0$(rOs!xt;f9jsG z@+?jJL~s?_B;mYP8U|COh2Fk!=8pK9Xw?s2!rs5<%gVUSW{+ortD`!!r# zbk3=S=PzX{F)>)|d#FMj@`lS%PpnjPO{7$?ST30;NUax%@!pS|zp=h&HiajL$<;3R2%rmb)J=QIy2 z`Co|9GND&`@1qmlmK=ep;A+AxVG-zx5~iF6K84Id9;I9+{_DY)i_Wx3f=_x5>1O%^ zva5|jgOI#~D483i5A|YSgun)({JvrnHQm66qqDRJU)}MP42-6CGBc~B@qP7uLD(dD zV`_m6VbIq_GVh(d*HQD#XZ!@VRp~B`+iAu97nKr6#?D;8NzRvgY{4CT2aU+<{8+A= z${#akDN2%hogw-&-IM%wuvuBMm9)mYybcH=a%}5{=PUW$GIe=rbc!Fybr=p@p zqhZZ$v_PRe*o!zmKE*weP@B=pGysJd2{!8jj$BfeG^e(345h4oFS9JV_9WTirP*A7 z&>_t_3PKTI(_mBGFx9J-IRp2Tm2ZQmq>T6c_k3rMf{tMEpHZgN1wj^4!uq^88zaA? z8g0WfxpUZIo#>3&hLug?*j&As{yv~#7zHjK7hc>5+CUN}R|Oa3qDC?dj4`iITAE#onz?oK7s+)Iuunvj- zS$f&&3*>{pCHMzM-z4X6q$@})`T6@RZpSO>;~UKCY$9`az6P3v!{pNxc%b-a@)q~O zw;r)koJh3Dn?uo=&0X&(bB9h&WCXI|!-JUgy#mRo$% znN@B4?wrO6W+e9X^?-PW=}OsiPQxBU4#RFLlX&0NSs&+sQ2GMSc0ydP^$3Z*0a1-B zlHddJtzWYJlU&xUCgtK*A(uV|Y6DT{c`)1DeJ(LlIRmv5%X_2yW6l1dLW3O#f`?KC zcmu3)eS*FJIltlFbRKM&-&`oAb32aQw^)-mM<9_a(b7j_-hbQ8SH?%{EHX4QE$_WZ z!cpK+AIY=2;yfSF3?3b9fwjiW1fh)ROo%?}@6adk zZ<>)w@icmc97Pu{FmE`e8W)0|Db5&n7q*+ZwGT9tRGgAQ2paA|wSZnMmO(PyPivuw z+;=pEgC=F*x=)hq^$t|j!hOs|jth*%Bm-&sEG^a((n}KkLn{RXKJ@BW$S`WZf}MLX zUf?ujDt6Rk0B~ZO2UkIjxf2e%I#pt+f{ot%0ZbbHup-WD?twg+m9bSlIc4F-MI}*MDIJvC5pqG z*HHuS~d1yw1F-4Tt*RyfYD{eaq$iUcGR?@8D zWFGULK(k^p94g3f;6UwfO2p_6l`!D2(MaV2gnqA$#i03@wA9M@nbKiT;?6%juc&9SDL`{Rp5R|=0Zap-KY&N7Ds^tN-fd8 zDS~g8V=PSZ1_+sRn5E%z@IX*ea>oVnQ=u86_-Br= zM*In*EaY{0mzb(j8SFisN2pq2iBHu7nhc%H!2si6m8Yxxy^Zb`rj-yxMNk;_;nd9IWTNy7PwZLQ7zt zt0diOlbF~U>2Ctvc$agzzWGWz*D zY_8ubno;OavI0qa*xn74*mUl~b1ew%F;1l{&kf;HI>(1qq+% zq&n@tp2o~(%B0p*R~lHDnu*27?;uP!=!%1%(P<>gZim|aMc*^=KYgmE(!1B2^@i65 zL7QqexOcyo=Y>22tM0$Wjh3HmC>=qlYL}_m^S|F_UYWVa>|WkZ`ri2DaaC+r_m96f z0zH;cHDoII@B9A;#ZmozWbBfaVGuwxal3T!!*e&KKQWc`DxgZ-iun{he^eu zcDPa0$uEER$q*8NBphj*i}>{y+z(Ix1@;rS2Y;%_hM zhb#aF06fMMj+O~x&ebKwf`ZOYUU{le5llG5O><~fzy7Mj?YhwYt8J@Uzk2m;g97P< z#*b#@?c?Vk-_{AufPdyDYEj4#ysm|%G`VSJ{%MTl_&*d={7+OG0IvUAKoZ-vceh_Y zJsupvT}gR4%Z=iZED%~w208UJF4@*pSh16J`YjS+}JqV7E?MmIX`AI6c zq{9q&f876~ya*`3>X%KAdGyz3{v)i&z5(4m13=CLe+M<>fuUYQIeDP+W&zJWfJgu? z4wHHS)IAWk1QKj-LN`_R+TT(Q8q8}PNkt9`i0XPcM6fK5%(b?p_giz75lKne7=-}Z77?O^%s5X8D2gFgdqyuB5Ri=I zj;*+B%S2fZGsMb%82X)kDz+7jT~47|O#3tTR$N6DAhH^HK3 z$vTdlk*NklYtJnXSbD#Wrnx8&Yhh+{I!FV!7DPZ)c~07a)w^;3b#g0}OlulNQK>_~+;=pdgM%8(Zg`-j#w-Qy`K6BWLUEK4)8C}RQ(NwS8g4yqx2Q*dZdTO zDHG24ShDUnj75g0;gL_p~V z1p#Sc0EZeH6bb1bk&u>FT0mO5k?xKmo@@NZ{d?ZE-uGVjdhh+mEMc^ZXpg zp<8LJRhiEt&cdZX_F*$&4AK+lW+~2Ysa#W<^1hVaVC-(<=KE(!T$SlO`n~q;RF0PG z9G3gFrG%455kg6WA2gSTdzNf0#snXYZ4?!hNTiOP7uBd@>R4 z4aZm90IWnr<}8pMb=A+cX-)tv;Ezyi3)s=n6v}4PWUrLeyKhfd+oyT&J@Np3d*>N2 z!WKQI!SIr<1LQ;Xrh&gXsSx;GAAJSWPOjLf-mD%3bEe2>MEhtXz+B)maB8L8r2|$| zx_75m0Y`z)k?bg&U>+$t?|PD|r6L3JelB=6fUInF$zv0wWfk-YXFiEj(^Jy;F4%R4 z{1O1Sxk>se8ub7;ngf0ONc%@0vqk%*&v)eNqVBk3%7MY0D#O&f1h&$bF$d;H#Bymom9h7(yvpKlItJ^$yX6@IKIbPM zL~F$1a^r12JZL;PT#S%OLc{<}Jf{i3cqNHXH!I_Iu6BWb%-u?^a3)DHZC%~vtq&*x z(C*7j-{{(oo}phBl99hSwa2uSpq2u^&`H63RZ^wULXg0>(`k=|h#k<_r<9LImztmp zGWQaZZv?nr-U-wG@z*{mdf9aXy422#k^2x!>kiFwH{DLH<``5ZZsOplbS!E30Iv~`9-4~_wK+?vJO{9ZnRo+`>{Tz2 zvO%u^)KkV;P&NMI^B15x3PSU5NA%0A0~qBOKz2v!SnA{oWoE`Ph5 z$Sz!6ZeKP1fguQ*N>r1CO@TpDSQ@DE@Nw8YD5?i--u7~p@i}mSjkIs8_;ej+0qd?u zDY4miiWb0Dgx10sfvoHgY~zcXm}vsdZpQ71H+`4gRmw7>8pJ({wQN2#=GoOnfF=_l z>#U|SaQvRksf_&#w4Rx}JTbn9VVuGD7nM&J#vU@i_8smyU~4pdNb|Z8_z_y3Pw`bv zzIHO`QyTGEX}`vLcba;7~%(2#ajxlJCfs zC2BtdQcCMBaUdu`tOzx~&7Br$Ixu})f=%g~$K*3d1iihe;~O9dZ)GVBKwfL1nGZvR zejojofkQrRq|KlAGc9U`29~<*vo9mg1}A8_hB=g(gUi)M2O~f$K*MKg*c8GM=Cl}Q zsQtaSyU-zQO%qx1v~*b!I&ev#>+NJRAE88p(mN;;fWH8cPk0Ou%fMsNM)T`2R(xz) zC-o)1S0FI>*llyXb%v)K@U#<5MwPo_Kd*V3r9??iqB@e9zjZjV8=Y|H^sZIw4wxD>4Z zv9|`+f{2-!io?<|7<{HJ@@5U*4OjrmfgfNmRQh(LDH6%`vOBkvVS_M7m(3c0mLL?h z66>6k#dSh6qo&42qc}P|N2cL${H2vo%W}Uz!bJn+8TJQ*R~E z4F$#uF90*Y(9C8bfP^j@5zu58#k>XrB(w1x<#+?yiyl3zZi)SFq3X?3tX}|=eaEBN z-GAf8ag!7@DqR4cZ$D94{aVL9d$M@o5muifToO1giOGi+2FJ5($d&isz<`7Bbf*YG4Z8-9P3wVQ;=> znGtvm9G*+M#UrJ;QgEM>LxHLTN)iakY?fdQ2>yw|x*3L)7YpKFVg{P!&%I*|Vga}? zs7naqsrj&eKM^Q%(kq>Tl;JO@5q`2)U8ncf@v zN*Xy_`2`j~96sVEF$>*jM9UKb5(IYoC_vK`a1e24z;|g$C1$DFM|f_M<>?RB8=umf zWZF(Gdq~unzMNo(aj+Di#jalo6(cxde{sRdFY`^7 zmT77Ek3M|=fDAaKjXX-xW2ujMxpSP6kgPt``5vD;T?#?Ho$ZC@BYdH0>LF~fh}Vse zZ`pTAa15L(ISVP_SP*z16ky=$yCe;K>twg2#(-$Z&xqaC!AUx})9);Gux@D+gdgvqKRbkHNw1LymvUnk9?W=n}BX=7Qpf z5qQkWxrEse-?XKHCLx{!&<2`&&Ps8BAv>jW&Ehd3T??mA=ho{eY$0*H;ydf_8-8IW`&)S=fv!M5VgM$?+lLtgpt zY!nAYtehai!cU7>-llh8qF@}6oy6i6rwtR7A5koi@<0iZ*c^;xqjO%!W7XMkds+LxpJ=6;_@(Efcz4NYqlBfs0KahpejwBWS%Un}N6 z^NI5#fO3NIj;7z<_i*kuqhM;e_I@YiUP+S?K9^r|)QA3WJ=@D5vpwJ`zF8nPA&J!; zgSj~dpA5`ofyyy_vv1yPl{xc$CgNca`KyAlDJ4WI2`?__lg)eVeQP_LT~+xp`)Q9< zdF}Zk&8s~bG#FmY;HV6%Z(p|-DXe$f=G9-@W@bCfqrqWtU zp;~+6D)Tc1f|C!Fk_KrMs#x{gFV7NQ=6oUM3N#S1S8Pd8=TR}O8)LX@vf&wXJ-PWv z=Aa}R%RG67dgplLq`PFO_m~jHW~UT<;5!=cW@hZ`d*}W3dYS{X5mle??aY%AtedVo zzFiU2%naLCTe>w-!87bAub7*-o?G!RPo4X3uL}7oQ92KNRi|Q_^S=G+g89f&B71?- zDf}T0{pWDQ@%Y^+{#Tqe6MFwufylCfKSr4OmqnhKTS_zvnkawYz9zuZc@xTSWfm^& zrjlKp@sx)??5_)&zwwxNf!g=mI9BZkuQS8Q7O(HG=q13*H3kSCLGDuY+j0M2^^nI@ z162qOJc0WNt9L$+#?#eTQ*{38-O*B&|K0k>|F{GLlr_Tk;{j?ZVjx&Mv77e)S>qAY z`4#B}D4F+^!hcnMFo;zl+#x!NDXC-CpuBdw>!k#nrw-svY_R?(WiC~}kld`#Yeh@V z?y3(d|0{AY@jOF(o#OP(DT7iL|MIY{P^N3zdIvKZSI^`=zSiRGAQ5){gLXeqNRG^|D6PXbb4ZccPs?K82-)hJe6Jf0r~f+RVcRt`TmW^#?9y{;>XN$&((0NbbO9&I^tiA^va% zAua+MVFB;MuCH$vOJ)DrTWG~4>Fwkn#BU<(W1G|P^M$sUDa70u^}sF}jks- zsYC79OXCQlnL|cn2QSFW&~&lYt+#WNuPQXF;`lRPz)DfdRq^>Lhu_j2J0*Xv>tc0| z5my?39dXywTZv&Xh3SP6wCMHUsE?(BGA=iCC|Ek2@IiwHeTEE64vNH3CV|C`gc0;5 zW2G4elZ+U2wnty4p0e$=HuS>8xa%$t6C7vs*_wQ}^IgY9d%w+*%SAD_x0W}Q6J>9s zVVIq+Z{VP|J>>N=s`jL0e7`;>C2CX3kcUz`Z{`7~f}zpX8<)CNYHg{-DCg1ph{`$u zPOQXTgstqkX8shBHVav(bzx@e=^pgS?m0ZMH9-E@irP;VSxgmO`VplNR8{uI#mgPK zmAXfJ>}fc>l)T=`l$DiowKMA|tD`%}|4rps`#)Z%u2UT}*~t)Ng(v&cioekS62eJ}g1sh))=mrB*YQ=jac&%-ywlJ_H0 z4^jLTFH;%m?!q$NlAVmBd__v96Xg5)jfcW6k6^M%{SF3cUUP2_m>v1zSVI%Smpjga zq{ONwn9agwuGzOXB>UGGBurhmMNW3zlvD_emttnT+c7>DP*A@OE33SFYz_PFZrqD1 zlUk6+&@>wF>*UIMHRS(B;D+dQTO{p8liuvHi}GGAgf|`tDZ=@BR=e`1(5DG7XRRpo zHAtV)NjtrEX>t4=6#0=NMX`|>j+qfCHzMhLej_45kAIQ4z1;n(Ml<2Ku@_6f_<28k zLG)GA@2^*7iMXt-`6ICi@<@RgL98bYoRpCeTbt~6^~X~QD|ZC9U&`uSxtDG8dFyyd zWj?$;06O8<9DRqPUS0`%$kIr?(VC8Jj7%hXYmB3ckVQC$(}`EBiH#02M}^u4KZ$A< zyCeQ!hE&pa%4t?~1OZZ|+DqhZA{nJcQ`6i~pR8mlVuyqznX-&YEJnK$%X%)$N6;Kp znk6emUh?`AzFM5KwP2qGfIINSI^S8uy*EzueQ|rHr9}NSYa2zFNE5Mu+Viy8tX?$^ z;QEvxOxz1V53;Y;Wqf64@I!%-93!X_X?y)`eD>H*dP(B2yN%gCN+gHD zNnUib8Rk1~ZAX)Mm=Yp~MX=c|Abr5hj7k&6GP^6kx}VHmds?IYz7P*Y3oLuwT@l$ zQZes?W@b%`BHMPBSq1Bnth=$hubbQMEHAsa9jevxU7frDqILA~IkTIyGJXE%gG@Ll z&Wz_^1oxxOhiUh*x7!AX^7W@?=W7)%};KDITuq)ESfI355)5PuIKz_NmNc^_7e!?9Kt4Ao4 zO_EAZL$#CJB!-N^oF9ilZW8kPL}kiGolnKF#+O+_;5elcMAZzyMatZoLA6fg3+}k3 z`cQ7isi@9D*r9WVsCGEVS^zSW89DLA?)+4WgH_J3rs1?bVLvEcmf)Eo&F6X{Qz+x< zrH*hL$)a%XW#vF?wy;TMlTXZXBiHn9K>0lmgA+7YWrzaz8-(KyW1H25*2&~ z5v1L(jKVXOB8TKOs-Bx8AW-5oRgF{O+^ul&%+PRGDWSWC=S3=gtha6T15(xJEa^ws zGOZ?Z|8suKnLs=or}-J6Ut+~GIj8$u90i?ze`G^Cp7>QQiA+gF;Ddu`X-y878-gN$ zBCS51HFI|;b|r#V%|i~pCRU|{V{7#g3rjPk?_l+Hwi(2Ojt!}{{`~5krpH+d0*4^a z)SW7SA~rmN&j94uDs_lTjyYr;fxrmoOo_x5E>@+tk}CI(!Rqs9e;JGKai-p7Nu>iQ z=mrp*(Pfx@vc7Gx;+#g*Aq6cs-?c)pk*iw`4UPxll9F=xO0{9xw_ZRVlgwWN6NP>S z+w3?>$lGdDT<){+n|HCEzUeh+FHZd-6)x9EY>R|8*a$(RQ^2}7=JrH<33m8>ETM}y zzIW|iH`apr4>rGT6O|-x&3OOas7=&Vk+CG--Iu|73nS5q7PE(v0BbJCa8(UcTDsu?(PYY#-VjlKX5jRDNS{FQ{*?w)`=ln=p7^xdMEc$+Zo_c0+bDd`}{FYJq!&rh$0sQ<1kwD%rw5 zJqR2hcd(q0u53qT;olP=5YVie#x?zpIFM52HBTkfqa&x9@u5a&K!UXL(zJPiOH4b% z3$4-(Uh&n}cIcskh;bJYSRzj}-v*K4i7u_uOGaQ-7E>fIMvZzVy8j9kmpWN-@0`yb z%W(c)CkJo~J}LApKI-ncgb>Q7-`sye)P{O1rIn+gAOXG>!{rPcXvAx)%RP{wu{X52 zOA8N0=FJ4r+=KK=6l*iQzDr&l0cbTRrKkfG%d98%!+b*lEXqdOwK?h`lZ^*D8YN0M z&HIX5l*+?mr7~>}tk0?zPcBYp_R9Yb&*}^ZjL!6skY*Z1NjU)y;qpkb*3#tcxA93f zDi=SX;d#0wGJLthr*?;^7B64@EWEI6_S$^Fg}uCpRw~`c1)!hdvVgK8>hz`@N1xfe z9g=PEk~!?FWi$W%J+93PTbN;b3730K{RQnvE-$GPt|(YYlZcC;;&c=(B0Iw6mq|W$ zb3D%CyF39kgG@$*5>J+T0<*Ol{0%v6i9C^t-!n0z60BdF#Y2^!i}Psh-{NNQ+TA6+ zXi8AZ3b6uDNrHx+sMQKkM_w+9??+COx!&>14l_!tDxNE$*L^UpWc0g+W9>F{l z20?x$KI?3|ZXY^oNw5P06%U?OBU}}>$9{=c?(U2k6h-$s`1If|4sye?!0fG z*p&5k$HNGe3o%!S4RsMs1~?;wGQ%|3Lm?+3;UY=d^mE#5nce}R&T|JbJ^{@k3)VMr zNhq%seHg|A@;ABPvkYa@^B8aE*nr8G6Jg7BR}QkaP_Id4XRHsfc8nz&D1$R1y^mQj zw;u5rLrqYVoZ-dzDXO8u{;`-&&7Des%1|E8CLiFi#<^u)AbBzq-M)La_gnb}P{6~G zj5A^Ez}08u_vDjT(oOE2w8jU<8&M4vFeUL5dGnw>L48->1OJ-_pX1h^D97MNl zU)^aiwLH2pa|ObSjVBwaA52nDAc9JYE126InUd`vvYf1YvWd0T>rXf_ZpGKA^HPSn zm3O4kjvV>Y^GpR-vKGFRCmxNenSg2fl0UJU4CbYn2S^O+)UE1hcYowu3Xnaone-1@ zlCGK?fim#9jfzKEtn^uwkHxDeHNfM$iJ8HdhmR`Aalj? z^bAjRp)3d!V&~Zbh9ZOJk0#tzR@&0}=MAFTrL>m+teze}(je6|k-tBXi7y{(WyQu= zB>cf%OGTQ^KH^@Hi}W)mRd{~v-y#G{1ui^ptq78*w)jTuv|KvbJ27-igI_F4JNf@v z(B%ACjph6^@&CV9WB-$dTo&x`o7|fCeKa$3y+HZr)~e8-@UF~kTi@kxo9m)~?qSy{KT;nkYK9Oc-YW5cJrG|5Xl7+`Bl2TaWOeP# z{Ho9+^m;yXV`Yf9d#s_$&y7r{GASL$)v zx=$TDyRHmWRj3TpbOos>RsFCC+(7C#F{}zH{;3Zj9^cNU|MTg<%{`*Ob~m`^f5{XD z{k73AOpY!hm3M9&boYD!6_;m53$#DVv;cF&hP!$LV)gp}rQYH!c+-Bo94|%875L}~ zCJLoSNU-He@C1Ta#6-U$RK|7Ma~T`Fh3ni{htTWJzg`otIR8B}^FOW`{r@jp~- z_5dhIXL(e%Z#b|`NmQ#FMGMS9`hD5L@2}^{dIQS?r;RTTav&m0^auqUR|Yc~!UJR2 z6x(_yZysx}{TGMS=0dIw%o;2b06NkPtRrHm0zt1?oWZ3}W_1aU4yc*~@m?yx&;`u^ zpmpX0KYuQL&^5vXyXKE-b^|KN_X;($x@4Pa zSnFmA`WMO%mPJ5VoS88&Whq6QRH%oo^8-uUuiHSTo=CN5a%a-_>SA{v63jrh{b}fr ztS`42-6Auq0STEEd!0PNnpq0JxL)R`KL>TB_rQSwtNM; z=FSsZNY+Dx%6`A{>vl$3>2HO;a8<@e4bAi-L z4aAIM_wE21LcowGEnN`pqM)ZRC7Df8DoJU}UkHfn4&Z6QaWq10>>V@FV|~Slly+^d z#u;EG4PodK*+nyJyOU_^%nhFLYv`W&38T|!GKKiS?ZTo{tf!4W7n#U!10c1B_Y+4u zIq9DHdl0QPCgfwK;>6f?t&y}XN#;4Yb_dKDc<6$SVkVMN+%3Ej&)>cpyYYh2Ha1`| zN{db7mH}puhXI5vRht9L-0l;7g}Q6~l#%TvZKiU4B$i{~$ysGxo{mKr)y)Caon7Fr7Bor7BNwNdk^2v|Mnmj(dmK#8M}oJV9C z*}SBxH`Juyj1c(y%aO**`*(n0j8izet7{5S!|(+R1jPUjFtc)y;5&8$G+JA12j~s6 z89AW6V_xJZ1q<|?k`!eB=)VNZdEf5;^akm*R#GjvXF##%%)qKR|3?yLp0m)y2_-Vhr>`B#xQ^{Ai8j z<3T00AXad>NE!7dV&C!ATJPvvZ4JnquT+ifJrzgeFWWEnbIzaY>IMTFYM}zagOvRI ziVoO&W^Km|@WK*lM9tl@Zov{lPrSZ?zkc2CCFQ|O|1ARdv?8*2%&4e9POGw(N1Tfb zRm+o=YPbIJS$ub;qvcSTy;V}R;8uIQ{d{MnYNxP^CCbXJsO*>Fn9rlJ{GtuNY&%Qc zN^`4V5&iKGo3XLHtDSCYopFwqwHB2ifiHIyUSua(#i;-7Lt$F%&+SfO$B73$c51o` zDf;3`SuvJfw=BzQOWXD=tlsCpXy4JhOi2aa-5OGdy*Ih1WqZN+m;=+cm(dH@NkK`? zNEy@if`x+iU(7~B(u;s<{UqhVAucDwDjIhadsn&A9DqljSr+*a6N_@w39YW*M%*UX z9-<76UwJ40Pzz!6Q=px}HB%P)=_|7ry}AWqa5V`ock&~K`60))BvugjL-(?P^tc&s z3fT9UO%0q4J^*esuyZ^33owbJR*kHJ8Qt{iq^@$y1JYST5qd26wzGW;vDe`t)i#%x zq&T>*PS7k>B{2D(WMzPg#se0|z>q|nA!-!*@C&o6{0=>%mHYN_d)6?*>*5u1>jcX#`g#SdK9P0+_ngk>*yNu;Gsvk1dz50H2}7VsEJZL zs&}V}uOYMM-O1Cbewv*-HlAXA_#&EZ$@i{NdBC6HD;>DA;K(K`KIoA6g`)3+7RgXp6+veNY_A z-fxsh5qFOyg}eArL>A?Y-dE5d&0!h%0MN0*;af7;LxElrz>zuAKXNegtL#@gT*9z{ z#B|o@4=vuB+Zcmt!;N=QaU|L%JWZrdO^H_h1V!eXVKn?P-$fp>kkZPiuoxxqEwz)z zs6eUsk3eGp?c!YL0JUz2EekSp_xuk>Y)?aiDpZt zuQq7~k-YfY5HqhoWxj$&f&ANhlq4kD>A07&aqG^<)5n`t&f8tJ_xU_mQQlU=lg>xe z%Og_D(_=HW!$!eg{SQwdeoH`CGHBe3#-4Y{nn2zIM*Mm)bAxU*Wsno=rjUZdXa*IMDLO5rGJ8OHASxxPD427YByq z9R8C`H!2P=Qc18VukSKpQ68R8M14u%;P&g|xlCj><)3I-c!Bk+1phSYJ`MsyGV?7A zLpJuihq+D%^N;H^Cql^7*5|m~H%2`0t9>`)Z&?8X$7` zeL_&NzT`8TvLT~AKh{LwbcE|zDp+)*X@dk$QWqey+8Ud{D{3Hk;nd`37BFbnRgtYL79+r`h{Zvxs6Bk4n=r*H6K4d z_YD_f2ofuYlm#zcwFc-$=t*>edayfNb*uea`v*%Wx-w}JXu5( zoZvLJ@HMo1xCf@)zC&O7D?2=Pe*b zbskB0@XLQieu$)J1VrqKck!N=e7}4JL{y2^m-@uc(|?T}><@PL)R5_BQaBmD=WH`r z;Gk}JpO)bG*%%GYU=V@bL!?exPUP#$jAdp|38QZ&iGANMja}c#?x@3$huE={TG#N>#ZxJTR+LQ zaS7`<0&Yg~tr^~wX*Kax(d}j6j~r~2i1SnnFlTKzvvW6KGs|EvG-+U1U0Wrl0iA;P z(Y4dp$DmvnBSr>cdP&&!o;Pf-A5g;!@bdj}h}C*hy5B{BDetyd7oA9)QR2D3WNDTn zjhmp1r3g+@F&3G<3_pZD#{7=M`_nehPl$CaZ!~}fH2}eoyWSDIkNnaJ@ z3^#(r&C^(qLj45pcG*cTBosQ#fzLf=ldK_w9dPMH1dEf#x_O$#Ex5zGP{Ybcm4tE-o zgOE2-{3^o8ywcAV>bv75`-hL!ItEbt2 z^Kk{LT@kxQ*wizscr{(bG|KZ;PK|D zM-r}mG7b~+_CtD!+5eTiYl&^!2IN%X?^;lug+bj#R3#{CtE9G)hT2YqQ{ z6Ycr@{N}&U2!cOWdz|nYD#8ds@OVZB=07ZYvM~0Sh?wYu4l)7 z<;BIDucU|N;$}V#&zKg7(^damD(7`QEwasvuvfc%woZ!PVjdH{OG`T1Ex#&E+o87A zZ?9~BH+cmcG!lyAlYMoSX4_lTrN3SsBQ>*jvtn32sK3 zPxW`$69JzdLm%#CflARt!p!)lh^2LI`&W&&$)21rny&YsI3|`5thv=T_^tC(cjBc0}T&CZ9Mz->Q&aZKO9L` zTykmFLr+f^ZHv`V%bciW{3-(<1N*Ch@`<~fn@VPt>2D^(-5O(#bf?a$at!t3MX56P zVsm(JFA{%7c{%DHC5&H&*jBA~Zl@jfl^3gcDHtjyNEHk-Ly)g8d~DU#Q8Jb5>n|iP zR3$GISLU+JMcNf&`&x_h+a)+U52vgYYKMUtx#7&x-Y44`5_!K=Mj=_0Sx|cCq0su} z*CiacMZ}L{+VN?j&id2<-KFJ}!G_PTuKF@IN93zh?QLrZwgyJR&;vBtG1CKqyPxzx z33OkbPp52sqmQIDWq&saZ^me1^`i#vs-q0Xd1~w%*g`p*Cl1AJ(;kb*9$DH(siRPw z@Rg68va?&Zx{7$>7QVrQMOmng&J4OY^5v9TeHs(OK#O{ zy*molEQ)tt5F^T+svS;CI~uWNRz_`?>$c!Sd^Mq1%a@Vm()nG|{lD1m?d11w`=%aM z*u_)kTrBPTylkaoZA?t;l^;nyj1UO>mNz3PMmwo#I8f?V)oMxRMlPJ+|my5 zZkG1F^%q^762-&D@7x?mF9U5yMiy~$rYII^XT85x8n2Bv@NVXF_R?3vYS{ja+rx*k z7<=t)d(WV>-dE=1uz@X7c8rz?+s3|%@5=DJ)vH-|S^l7R7rFCM;K8Pkp(N|tg{x>;P zG_0kXcYhu$bFw^``m9Sq#`Ot>=C5rB-vY--0ey`K7Bv#+dy*>7IS-GIXVDLeaC#@M zuZdwIFfMOVfSh^ACSl3>q*3zGr{_`WnXv6&NgT)?WmG&(6KxhQQ^4T+fd-lsq^W4= zvJJb;{d(z&ji)6p-Yq+$ReZ`Qr9wpwSm}(ey_zaqCRthE!p-8i#s~TBf{$%j z%;+>DlrWoUQTwMCuG4Y@x~giH8g=S)zrgZLlJCzDbXBBK{~kZ!W1_&wL@L`sG~|l)tD)#pKg;ndr||Y-{Rk%MVV7;5 z`=JNY=j+`lcqqn|^TZoo{kS#nTUIyo*_Jd8EiN2#QzWy_Jv9Tq*JlO%mdyMtC!*5y ztEP0dTAoz9E5Nv$TD$iV%44Q7&N%I8ube{>?<%Rz`17!nG>JS9UcN4&B}3y+47qSg zYvgt>v&{X(nR`(+jkl3U3e}*Qm4v%@e*OszIb8AIm(eQz)rP!7?@!(NAC2}`Z1>{G z9JVzWp~siSRE+;{)$sQ%xf;Pb;zqhw9-$vn)QVu`bV7gxI=GnXsNa|b1-*!-qavn^ zYtqk0+!Z#{<6glv#v!a8XZ#WY zV_s`}z&TUqZ8Yrh$dUewu_I1Aev){;^)E}#@Zvkl93zwWUh6e z@A7dYJzhx5NMGD)`6^ly&#=4~S?(J}7KG@(;`c26r9>ff7{UoV zZsXi9nWj{RBzdqAkG$z=JOs`bjmUJ?(;JvOrJTp{r&_oi5?=Kyw|-!U@7F4Y^d3T* zzK(21z1!WEsxSiALtx_|)|^%^=jc3VlHF}{7Zms=UB>r$F>yx$3 zt#u~7W2atPSC7b4U=xc=d>&dvR6IBsbVMALLycm!@v%YrLSy@?spaFHzI<+9I4(N4%v$OQ6dY&LM3xxe9ikyH(%U#wSG-<*hoR;O~ft|OiV%wdHA2MbT=m1ez?6Dse9lbkuz&B=*68guusxDjgBHn}!2b{6 CuW0H3 literal 31538 zcmbrmcUTio-!5(!K_L+Ylp0VWf+EsF7YIdxARtOdKt!Y^^o|NhXh8&}1){V7QlvKp z4IP0l`_99C75x(VsM@fIV-Zvf@XMcx~T-E8O=Ood04Lo;2xMCaos-^Z8x&t!TlX z&W)p|Ae#_Z|AF4a^n1bg=5fnx9!++PF+a5XI@gQOsm&_&i+n;Ec|_7 z2K9M!HeBzN zJ?Y&gd*b|Ur}lGq{gTDRuk!rAZ>irb-Cm%no0=n{|Ky(&((-{1x1i|595mq${BR>( zKFm)+o#I|DW<{`Jx1f930-ayNYdlOSi4>!QnvBFBN5$Z!>KA z@=V-ETw|HEczA&G%(QjE5Yle*nH!5EA8iK3o(z>aG8|oh?oY1_hm$UMs=~hPt$crr zHumODrd}Q>Lai0G!ax$ZkHb0wNucV~y8@Eb+EQ->NCNkFSa;H8YL%&X3nYPZhc^Hm zO1SeU=YK8M^%ObZUoiWdRNJ$*R-7b!ghj|SXVm|IGW+9$Qcrhx49DRaf})#x1hA!c z13LC2l_d%B@dhrK6$VZj*e6EH-YW4S9+70oh)3;iER;;Z1l<+}FCd;?Prk*6z-C6S z7Dc45^#m};=@04Xx62B|Gg78{oQbjz?T5=zdBjf~(r$vYyqGb!A^Y{cr?XwtMWCbk zHTzzvw;z3;sAh90|G=65h7adcIQZGqSxi#0Vw#=!!RN3;33r$f-`(Co$;*vgW>KuH zaO3boWaV-L+ed4huiJpVCHtuh5pyOmh#^QwsCsMM*k!RPinrwUV6ly^>&ma@lg>0s zPtWmSL6+&in+a!3eb;+-2Hi(Jtrz4iah`+SDU%A)pBBi#9gr z5vq}#xv9rsEs}$QwJqy=b<6d_?b@d>R>5)3+n4-Sd|KrE_O@PHJ8@+_VsA9KD)oSt zuRWG1Nb+kTOKw+?>L!LILU4lE6p9sxWNfP29>QeId+E!Ld&{{fFb}%Vv4*lSET(<| zp}7!J88h>0O^aDnCbB>o8KamUiws3&j9!sh%}8{}UaVTGKIuID&d4DvuV#No>Ox^G zQqg63-g~hEk4GL$26tR(nZ4pO6v*hSIMx*$Zfj&#vTobu4cU3I+%P?+W|@|^#nAAlGKKF;|i6I zVBf0qSha5n=zB_7VZjk{)bz2*@p}_U=cQ8np=W&F+^rS{CA%w=fIM_Q&7cG};paQaaSbT1o(HBJYCk3&bI*))S!o~!G zBE9NmQq1BQWccv&VrTeT&b^4%-AD*zw73jso*eZgC&`jMp;+|0j>SwC=dkT_GKu29 zT#e9aAYySuF}%+3GfNyBBzs!AL1L6qOLqOfto7%*_~uuCQ43rA!qp#o|il< zzRtr5tA-7B%a@hp3Z>e2^UkWtYd_I7>qr)KLUuZDSIFMR{V<)CL?ctM!<@75gAVfT z8sP_$#|W!DhK&_6?cHo7(E7|3?6f;vN&W>P16*O2CE!X^FmM&Ch@Fi2a+Sn%?6sH3 zK7|Kn0E3`2>7f3SJwj7_zICf$A5Y5tZpduH7#+Gt-(A@|btI;KHI7k|r(sFhl=>bs z5P0$iDRmE4JOCXlCpa5bVKy^d9BfH33=_i~L7!L~DtgW%*uAp^y2Y6-M8e9=vAZe| zafx%JVUpraYede^;rj{E@!6jyu~o1A;B$5K^p30+*XopjhCB)j zdwx9h9)z$wXq(?A%VPaDlVS7i3_?bLy$20@OD_YC@k==aQk#5oEXs))Y7jt!2OZZ0 zhffQfI95aM7WxV%60(@MD7|)8B)HilPeFn~;~fU`+m{U2Y z>bA|*TT{;wYoT1~;NQsv9!K=Am@ncaCVK^!DbcSAJP33$=o>muU*f`Q?6#t)(EEcF zj0&M~u!KBXf*V=88=O{9&3&tvwo-54~?Wb-I0N|gVzO33_nyk>mfIz^(F8WPx$oRc&`VF z#e$uuzKaPg51XICmu~z*E6W38_-xp;S@mgrm03*0)3fbZJG#zS@>j1`>IglW3iF0 z=E9C14-AABBRRmUKC~UZm$BtHQY?fU6MOBU*vD-_j8Px3cT1c|E?#?=K#0?;sGWS? z_n7w0p}OmR<31}R>A>v{2I+RWpRXU^UQste^`pWKqyuVA9G-uDbp+?lz0uP ziBbF2E!mQ0haDGuUXI_h&`pl|4$RjE_Rz-_i`@lwBAeptN^m(Cj#s8betUoT;U4Ds zWigEP#M8$0fMUGO=?WhPijsT>XNt9EM`@vhR?{n7*app>xWWTdJ#g zM!~42;T1EA!QL@}1vKW%MCo=jDK5+6`o3&Yt-LFV@NQXhru{H>`Dwgk@x>0kYL@Zc zl7zt*$Y-<8LD9x{NP%CvwO~;%`@r-yBa(jE-hNLu_nRd+2P9BkJ~8GiY;>tYz_8~` z+@h_K_jB zy_xp|yV#SF$puA?kTp~e+!}s5U)O$bY2$P8dw+Sz&BSxNmM zt&z17ofwX~ipV|)Epog|tc!GlNE<&e5xFSsy_&;lnjinn$jIfHf-HINHOGD!%?llv zD|XK)`l;jt+IQ(Rd~oxDSm@5X-H*%-U^e#_U0e3*gHNo2~+QXVl5ymqESl|jP?Re&7dnmPUEx6DNzlqsm^m_vea{72REx-ymp$OLQ- zFFdKh9N{vD@^LfW+vHe{`YI{x`nXhHdX=w4xS>iD=9>CS&z;jJc%tOFF?R;yz%Afj zlC8UYJE9T0&XG%v+j9N&b!#zpIegc;RvxcGa-&@;KOYSw+{u$773-@h-@Q-z-JbDN zZxZ1-vQ{K@L6Cj3R15(g2`&D7`O_0-nnFq8n{&5?kULe!EUPtijog?UBniN&eK{ zyaR-fh2jJoKg|xiU&r_=6~f{0$?@UJw+47Or;d^Z0u;N~JaGS$!(%I-@l8dh*9i}4 zBTN%D1(Ewz1jre@<(I+gn>`CPOQtEvyH&G2S?Dpy%a}gz^^dy*m&%jAm!Pk@!t#?M z8)Cp%bW=BvwVU^Ed3A7Bb{SHN@-YnFh5cz*Z2Pk?sOz)7$>(!^pArMI(Tts5g*d&UBQl{i0b^eL`Q+cqJi8nP$Xz;|h z$BvWzt<_KWeOyy6$AU zMSmIF$=V*sy^x`eSU-=jy9}PI-*LEA=)WReqV{++y$!O0@TBGYXB6FXB4o|ol<&Tb zt+ahZ&pcz*wMxHQ_eB$&a_esAg;RR*Z%V#?Sj>B22s)QI7MSu4fztrJ;_l+y%*tRg0cw9{?1Fl5C6DW(baAJL zf1V9RcgXFR@-(OQ?7@|My1wPsp53qhW7>X4Zy3dv`5krG4AVbtvpav4rl|4zInK~Z zu4e*O741c4&%gUdOt$4Mc|nzTn0>YFnGq$w+2kp3Lw}dgd&h1Wo!%|<7bqPb4Nw+mzbcvF?*Qb~(Bg06 zhbFzVFSBfi7S0(QwM!wmr$f*YQU4k`!7)cJ86E4ZC$)5U!vS~ zs^DPRe-BqRf7<@ugnA1_Kmf>}dnY@gE3wFl+gn*Y9?fU@-AS{nSnpF9hjjVeSO2Pr zeBhEfLT-_`irk4NZ0CnAQTvF6^e^55T`~Yu_ISWuf6B}f;Px&WRCZkDkkIVP%ajFB#DxQ%Ff_WUsS~Bo*=Ylu z`h;C)O%qV7TZQwNa6X2?KymGxhX(<#z%EhEL!ywDPqRyu8ubILny6+MFr29TT&Jj} z?jdhT-TBYIS)ooJ&qA-|5$7Hw`K?TGKXQL&-|m0}J{w=-S-PKut2yZHzP(A==|ep9 z|GmJFJ?62#_G50X-VvMi-JwWfuKbd?-|yPlp@TK;^|wntH{VP&v);;}<@tMgm3HKs zzu$(=`Z6A;Coi{dv(8nsRlJ{Mi4B_r-G_~)-glo31O0UOOOCG={S-h?6K~#Ub1>$o z-^Y8looJlwt$AP?QoYl=+5RwA=wVOI{(RT$+ZwhAl+Tor^~cnM_2yyQhS$P+Am9}T zY$Q>qjS;T4ZXT$r8At=ReX!*y;@s#VdyU>b8kw?RvyHbth4}7ahNmhEO^T& zXZiicaYc4-bqE>E-+E<`V@nP50Rk-|9Q@*`Q>NK&@XV|u#x&FaAhky7p{gcGXO9!H zu#{k$Un7p^nAk@7% zc|T7O3u~u9_O>X^`M9k8y{7HTocsgInlT5L|D+2g9NAg%ZY*o)gQbXTeAKbHu4A5Z z8|gK1*}LYiiwm!k;NIo97|CRoEN6CnHJW{A{k#A6w-oObq3H>E&!yoMDW|-7qeb4% zAZ?aGBC7KgKv}(Ckd%$~M$>?8(psmxWaL`Gi=;|c+?cPWdChXT`~G%nrRJJIpNcns zfae8E1&XDg^4%*FDinVCP4{q!{p0<&mf_lc(^`Ytm@F!}OKsy%B>gvpAZIp&eV16; z63JuLo93mN4>C(i7*)V*2JZO^ZfLHD)ckI?E~&jVx%Zn7cb}1bqlZX9i|?(q)}PeA z-m=%BC7<>EXZ&!juqkg0j5#X3^uFu~wsr0|uaqL1w$?hWODxA?vRR-mtn1^Rx?~~Z znX`4=o4U;FFn5xZiJy#j76Hn0J*HL!EQ!9{06B5Wg4x4uSShJIvYN7ALrhi`L?-X1 zMhqWBH;1|Ig3pIvq+qn=UcRw*lJ#Y@RPcZG`=|TP(Ny-ukY#g?l`W)Pq!x`%;D!8D zMAWIf2fAY(nn`iql6sNVWEoj|{du94+9W zu0ruwUxk%_9g#bJJKKr*1I474kQsiZbV$An+Kbm5hzBscE zE~{))W&~O`Pz$40hzOJ7$+kF~T8-jFUC@T-wXLgOK7alG-#5cM`+_d^b^&^6c(!RG zvc*OZov&8a@_?3=-Mq65_KKGwO0ix9t@HTBi>Ee>NJ^ltsZey%8o;PU6%Wg`^RC_f z_7ZaK0YFiov}e2eBOiSv?@U2gH;chjZkHF+T$}Hr-U?zt7w+XPD^5S^D1%3}R1+1` zndCHAPmmPHwB={R;x58rkYlOUo{M##!-QKG!cmdJ7)=Df1YY*q;Umt3P%kaDuizbI z1czNE#q`}I{W`Dtb@}=^RhH8UG&JBdEfx}Drm+O5d@9azR0~#O5C3r)dl$t>MwL5r zB=$bME};QLm7b34MLES&_<}#2NJCxe%`vF$K02~jk*Nf@vu1?1= z@Pt)zXDXqC$`|>t34umyd@0fmHzU<07iq+&^yeMavQxl^x(zT0&O*A1=hzU|nBp}? z1l%s6^tdYMwV`rMD)g;mcaSbzvL)h1xpf@jOjKQEI@Yk^5>6qA(f*q937+r>#=E4z z*KH}Vuy1PTjI)p_W2{TJ-bg&n=7c7KDc8!P_$&srV9R@0+B=sc*Ihh!fw+`9-4Sda zS+>za&C_&(y4t;1*fr!nM9yu=N`Fg_L%YgLz6~3beC5nGr^|DUP=v&&n!uE4*=EAD zfzkg8ik(h0HXRkT0FmlkX>vb1bMsu$;VMG%C`SqtZL=+84eeA4bVzmVFZ*Y1P929Q zm1Ah`cT1K9@1jGM1JSmb^4b;sh-!e~fBF#7cR2!#%05)6v|-l=ke|S(o~MQ?o%4~x zNjRa4EBjC8Lumt)G%gw|RbH$Kj%KeatY5S*kY;c9N_>9t0x57cg8xGvY$B)T0}iMe z`mDAbQ6a5BJQ#HbBCJ*yvPwgTfvACK3ouYRJm`sW0QZz<_=UM{uXLUpu0Ej27qWf7Vm9C5oM;#Yp+`&o zzT@Au$w)$v7dKRt3cXh5j-q|VYn<^CB}?S4ydXWZEW#tX$7%FDI8&geTii@3X<~Hm zWVsC9I6CJ0wVESkJ`MwN>^_SmPw=;|A;gXn@l5f4rI;gl4ZELT(9V2 z>%(4nhtRg+-@d-XRtU~*l0G|*W;|oFJ1X#ZA_w#`m$r)?B06hoMBdJrK6;ykzc1^w zbkUog?VmE3l>as7|9kfTpZ@)C(BZ$s7)||)lxSR+ITfAEm-|OiQO{$~$37Oijph}Z zT^TlmZ;MqM`40O30GG~rPI=k931Ihy;;_j1#Pa5v1}dQGoWF>xj1s+=L3TNt3#7vA?Qp@W+VY_y{JcGpp-ViP?6^SOHhDf_;W% zqg$kw-@uv-RVl6Lg_*NZACg0Xkve-(RmQ{Z&wCquC_Ym2|FqX1g%UX6P)9t(0U^dB z!yN^u0U%YZS7yKeom!-6=hrAkF9EoTgns4vLou_LPivLv(}>~d8U5ldMm@N2VW=$F z!|SWREcI*#c*4wNtXUq=I_tM(sBq=4#0m85AMx|>nup@&pIQPnr1mraX8x61ffPUXRYQ08{q^}G;sdH=SmL|8FjK3?RtX+Bw!#*Lw-e%9_X??5*|yik#l&0ER<4`=Oa{ zNu|f~c#7Y*D95>?=`PDgOv#3DBaCHfJxSWVm-x)f-+3g8SCu&_Y?Fof_nz_q@~x=a zb*^`oDj+6%&1TI8v6yCZd_2b{-kpCuIU+#f?mhfZmw}rMippuphGII1~JkPtoNDK@awD&I}>{vw*$R$If#=q_>xB@L0$cqGA!wh zVA>Dzt1mto>a9XnuJ;;!9CE0#U;J_b)HYC#vZQ=dPIjibhKXJ3xJs(?W(N1F!-CW* zx2F>47!W_3`EWTw#zu7y{$6h?h!H9cGxDJ|%~)ENk^4gb!f7HxqFoZ_y)mDqTj&5N zw1Y>(wDLprN1VQ2Utdq#d{UBw z7}xO*9rStL3^dk3j8+of$5&ReB5o9KBwjM2>zXp4H z09AdkzdN^^B^~t!S(zp6%$q!N6{uig z89v^H!U|z8V~twANfJI#h0nP@Qn;=h21NYn1M`G{JWkQG%X`vW@y;UeK^cX=itq;A zq{AcUw=N>C5E~$=g6z5M3QF&_nF&!^JE)ee$MS)kt`lNp)Cm*@TB8X=VONxX-;-D* zOOi5mxdv5+W{%G`Igwy#-@W)Vr5e6FfLgI-UtI`cNmYSIXU}v6PRkp`vX> z+#Wh8{(#+Vw&L7J*w_IML&>q^a+$|9J`j$eYlO-5Oi99YVFTN`%PWlDe#Ct6Wf*zV z@GSNtND8Jii62aqa7?yk!7#OJ&*V(rs<_I&;YWZva^GgFUFOv}kMWFmctsQLHJN!F z!N0k79)XSI*MMt-Xl`@;#9AX+XNr)>RaSDT_9o~xO(c3Nh*zru^7UBzgvS$7%p5Zz zL|Ea(+FLnZqN0|Fgb>Ns-^VN1zZ0_N=VXKcP7tKBy^oyF zVwq6zR}Q>2<(6MjyL(0Fm)99S%%-|X>nisq-6WZ+@78(dh3yod;oEtOnxu?6+CPF~ zGZ|l2!!OHg4^Bs-7I}gT5Vet07#N6c8ix3YdMOf_$)nT_1y# z5|dUU1Z2ue<~$tll1rHp+c(=sv}wZ3u}0TPbp^4NiuRTe@gg+P&W=nL&Rw8(jvFVO zr|M|iHPa>IvuKUlDEDRlJb5fF^#KGT;wFt9c)(kYV{2M?ZLRpgg0h5@Le zJa&x}TPlcM5lQIP`(+E9+xCN zpAv4^PIp8<{T(7b$&FD&Yd@GbZS=Le#FJlZ;P8=c;=5S5QohzA___WW?SRZBoj%xG zOSWT2I|$1XMCLmzh=~hF@ar& zHF%obfnp0&v3>gYh;~t-!L@heVQI#VvQ~9A*8_Fo>Q9YyO?XY7#SubqJ{G2O3}N>h zoWd)Tmp@?D`ce*%D&N)gprWV12w2;&YJ9kAKF6ve{BgN|KLHw8dpD4gM~Ei{61eKw z0=e5K%Dk*_bUjq=344k?WeOor3hd{Y%g7M}PcX0Z_w@izv?&4F5Mdjql#WdC=S_WgXLaxL-4vz#3b+saTv2z11Ep6)rRhX>jts0HCq9 za>+nrI=jVCe34}lxK|ITSzTJ!TkvVq3D#hd9xa%1%vHyyAYAXn*@o@UHVSF7%Cvok zXZWzr?DZ>EcA2)_k)`C5b5Yf0#v~?OOKer9a!mRi$9fzMYb zcR1O5Gqq|eI~-cyZYH8 zCt-@7-nTW$6nGGS4QMLH6vp#baA>l5J{l zUpESLUUI%>l16yP=5zC%44-^hHKZjTqNptGoNYQ#W1!J^TKeO%gf`IK03*8ImTgb} zqV}Z+Nq2o~?cxWeX9lDF5UgJN1X5hGW!Xf|=UU`(Ca7Wo=>+CtJ9M*=0B@^{|PEFIbJi2kuVKk)h z4otblbg>Hv(sef!(RB{tpS7H?u{T~MepdEOM23!8`a#5JYMjC+WX&982;QOlp$zAq z3ONha-H%3&Q&f{IS3}UP>xPMdbpFTBHqGsV9JgqrlLspdyr?yAo=Zl1#bn9C6ku<) z5m3`D4%AmkY`6X#lWjV(b=5FoOK0dajjUOKF8VkwKLV{l^UY~@ieI2qcS?6IQ_Dfi z&R@d_xnhwTTeAp7z*UqPGCvEztMDMlQw|*ODyFW$X~3X^)bnmIO1G9HafTR2YFAUP@!wecY1cZsj?&2gT-2tT{sW7D4@%0B%+ zm<&=VtuLGeHZqniI2*hrOJ7+hzvGFN6H0v>^`>boI!I}8{eo7@fzoMzUB@R-92;X!wWucXpN-Rv|; ztTs0KgCqgiefDT?!|PI&P!^~G{Q6+E{zDbU&JEU@3=V1P1N$d1X_BHM-9P`SVab1q zXMoUy2?6B%6Tkd9FSQe?$Nj13KY9Nd+o(K*EvLUnQweN)S34k@0?q;y%dzB~-RJEC zVzAzZ#ouO;)a&);dw3ZFl(&&xjN2EMZ0*oQ!6dM{LMXpIh{9wTOc*d&y`+nC<^#l`jl z@N-7Fe&gM}>M#qt87+lM$S?u3;r0ssL#VO#FVD5Fzl@|DfxUNlXBYk#8Nse)^hNW- zy)Z$bvb^8(D2%wi|IiUC2{t|}HXP!bT`J%H9Q8k_2mhu34kZPkYpKDqp{DozWb>@R z*!KIKRbKK(iE97qod8GwM>Q_tep}-b?SfMKZ28G~#O3HdE&o-!^<78J_3KMip~2C2 z$!C4Re3$e$n133#6RyyDI!PJ4~!MF>br#5c>@U~G3;!@?@mH_Dw$)u>Xr{eXi#kH>SE z_cK=>C=Q3eMDiq@HnU zf@MthA;(@UU60wpZ$gUNnZ9VRjrAH_UiEHj{A^+{o?ORjW9Ps2kj!jMlw9;&nX{hi z94jaPD2SIzx{faZUzqTu0VN;sEhT;2Umkmsy($trcdNxkswD#h@$s8mFIqE;{bd5DI4+v7yazw-`QU2wyoD z((f-~L8aqBK>dM}e#+c9O>xesgf z;?z0ne4p-Utr*-0*$zgg^=e^ z@Kspnd=~Gdw$`H1$&qI?e067MBH_O?)r|sevIu7&%`l|N8XNJmt zgfK=pgGOiPct^sJ*2Oi+_1mJ#xK){k=cr4QF^IqYWmDg9DN)dGFg)`Fy(cq2+?P8= z{P}9q($N$JxIFSl&Bll)_$4pq2(9+3E*#Ettwt`Z`e2ERcqV!;XEJS8TLOODZAG&8 zW4NE7RRkmDo7nTM+n0GBC8YQU(DKdVcG;C4HN>62*s`v84^j+@+M|bmX+dH5FL39L z%cO+Lh4*zQ3118PNPV?Z2Z5dyoQ?{sh;WXU3G@fF>21s?==w|aC2bWnFZ=3zG4EyM zSFn=#XUv-TEDW{S8QKd2+wc_XsJwjNwtRZ=LyjE{I-M5pWc>wAubnTxEINIOZzrxA zKP;26=8U-=)y~5945^tW^(0_9fJ+<$g~rCHCiVLYhu!$A`{6(Ks+Bw&ipKyAA_~65 zzIBU-$;OP7W)11MjO8(ja2nhBrn~I5_SU$VkLV`ByL}ukp=crW*xcg$zUDK-O^9kZZ~VpB-jbLF$^m0W?b8 zPLA{E`@6;eRf`6-Ny3c7jvOR~C8|`pJE%!bE|M29_z;5tqMs-g<{49PR%pyOo$lYm zYHe(1lK;mw&VY*e=a>2U!G}5@WtZWX zqHeb+J?gyL5T(fn3Y@%D;>3#v6?O;gvOa8SiTdiKF{9fM)TFlt$sc(`jW2*19B3*Lb3Ria1`xUX`D@9}Cqun)Gt%b53*302ed!R4& zQDVv8;zaT$eJA4@%HOp2i6Z`R(@cnj(#5md0XKQR{zs7}+`(&BJ&ABp=RVij z^!m>fPD-=6ZsU>G15NnD-BKs0|J#562O0d|6PeB@?}J}tmwr*rsuj1Lz&{i=fC$vB z#oe**!ldkhsny)O`44|RIBv7IR*p!_uzC37hFqcJ_|=0;RE=J3l?ntb8&Q&Zt5kBk zV7W_cD^{O6tDj7pUy!=FU5&tI*w|jQ8T{hELbrdCnL4Z2kACTSrOJTydq2_od8h!I zBxQeZ_pRA32NfJ}4Ea797swz}0ZmiME9sQ~e$@{ZXi+zSmL7CcZRGqxwMN+QUu2^0 zQgd8f&`-A+EV1nac-SlTz)S~VC$CjeOc#}r{o=j(EWs^eU}IrO>9*`ooYEnHS_315 zYWAG2@bqz_slJ%Mtn(o!4kUqYp|<~D)|uK#K=KbzJEW_DAx5p!U${pda6l3$M+JYt zp(+1Zg8qMtg}+A*7Fwy>FV!6Er#Op z&&v(7$=-`u7%uth40}u^=KLBjWdJUDiZV4^AI9>Loa~*K{Bdm+pl{7$eSiP_R_)tq zyZ5h~ZtHL@&&DTvJ?p5D81r#n^!W`?z~E#`tQ-Dnfl&(wyH~QVB2q{7)qw4F2~R# zcR&$DlIrlQu&pc91zZjllkcn~M__5+T{RL>Ye z{o*VWVxQYrP0}6&p1&YWV(*9pjF^ibhD&J-YiKl$YVtFX7K`G_435>?Q-=IP$}sKC zw`l01{anrTSYeGjg;!A9P31lG(sFIX>j z`M27%cbSFrtz*dA``l5+)Mw^w01;RzA}yX+$nr0G(h}f}Ri!rJoBiJ)I5w?PR~jku3)z|kQjub6SW3Giri?AeyTqWj2nzJ7%;Yy?PzH!< zJUHpw85J5*V5xF8(gREj>p(N+DT4xaW!37SCv=JU{$ofJgCWHt^7IFDy`gJ`=z;BQHdE6p3 zeE(;Tvq|)kkjy(67w%-_v)T|YIE?he*~BKMh>3Ip)(ravK3V%B!^B=&ga|F=wgacY~CwyBcqgmOkL~?h{T3258ARSo8 zBS7PS!W7;B@(F-rK|S!dPE4;eAFV0mM5KOZPY>HzmbSzyc+|t-2V`OV0lyGWNVj2ZYiB3 zTsIkC5Y4j`l2O5xm1SJ3)o`xh5H|yp)@;Fe8)V#6?j%{)8*oS~O!HxpE2w$r`_}q| zGx*rdQ;m0R4WCD@4iwjmSqSi4v%P{v4u9v@z;J`3Y+kxrL_Jfby_9%{yO8QvHQ7+8 zX@dZ0&r0-iUltCquy_=>Jp)ekK`SQ9rj2#AJa@!%52Tc~Ej;EyuT2E08oXzWO5gvi z>O^^Zx-ntOW@wmrEVTM|j_{&A_}1UaPelzznx9}Fzun-4F$F4>tTlK?T`}&r_xvSb zp=9ri@C#9Ny`BKLg{8u4sC@35iN@m*N=%_-jxb1>oPl zU=wp5UJGJOVb%_9YF;;&u@s4xxe7M1J&y?YZ*9IMtMg~7jRLbE3K-ePc?0hVkzzQ! zur_krk=>cd69#NA`xWkyo=-3pC7-}2YHfQ1%M7_E0UxodcxV1#AOl8-UUvQVCvG03 zQxT(_9VzfQI{_ltn_K68W;t5zfEn3dad(7=m^X>C0 zaH6-^7s};S&wFr2Wr@RLhs-kYK}DSF_Jtq>xSqF=S$Yh@>bId5Kf=jKlKnf8+0)oYFK zb8*hwG|3?2?kJ8qFZ(5vb_t$mR=aVjR&_T|7@t`>ogDHt9lPfz*vJQ$bB^9|Vwk8J zUWx=v(p2d5_*`H#`s^u-V6`Eq&Y1^{e}^9cJa`r0!XauAm;Mc`RE~9Ss?{5{=ehrT z-#>!?KVa)08}5G~xIeh;4?*|eW3>M$mRF7(tZ=rzo(#a_(!Z1(72Ex%S@-Y?J7IXd z7=YgwOb^b6{&6kGE`oI&d=jHdg%QzK{bzx=)^s^7QhAHQ$wJ4{j;8?`dO5-f>Z-4WgmTJ{~q3iGkz z`zy{H0pgk4EDwO(-Y@!IYhUYFs7&4He=v1zGSpQ)CKJ&^l<$YWU=2~t;ZD>O#4MGn z%K+AWOn&kHcy4Y@i~MJWy{pyG+7A5j$&{JycbMbOcloIPU-K$@4L=)>{h811cKgz; z0DH;jHsac6q9YU1R5$$JK6kv1y4v7f+rPQK`ItY5{nmda`+zY;#q&Xc37Z;^ z9VY);gb$N|x#i!U`2UiC1K}7@fI38|gO*^EyAbb~l)L|Ph)eL3^%ypr)7#TN`9Rg* zHE(A_-uJ%mwvc?L-$8EHY}gpAXqK4fYnAA^F_DkuoOSl2*y863{U>_5a@Rd}60L(* z1e3oNVY~jhFtDPVcH;{dQFI?A2iQKZKYORsOSLIqlHVeI?W@^f=sKhU4=u%%*0prN z8oZ0_8QPoB#fg`0Q#rof12-!o(f2w3OOWUqdGmlWaT1`C=uR3C zfToDJO8piAB3Y*5#;lZ>n4YGVmYJ*N?E?||xr+@!%fJHz5RBKD5u`QVV^lhinO%tq zbmbem9e$Uq28XLKHq#p9XoPJ7bYEiC{+z$ONi4X7g-4Usis!*y5`1v9tlGW0dXvf< zo@k=FkAbECEVP2j2Y^2~!8?N6Cd?05bpyI3sR*Y)C1$loQubN%vnFs{l4h!{9;S=O z2bucriQ7}^Pp{giF~E}R85VvSIQksyy)0H9Y+Wx7Bcn7iG*0%5(th~QtU@iur~?2e^`oj+y2+}a!7HGBbDo+p)w5=z;YfnMF#kv$rCl_%eDhQncD)AtF>;4WGmZjWVYX7^ zlWZ*^-v?%ledlu(*tuWBHd1kZI%5ZY@}KYFRPWanA$$4SO|m1TWVY8u=m2#S-Cf~N z8L*QY0?6P8fU&rbUXJG&&!vVhUsqEWyM<9^XlxX65cr&{==( z^@FcFsgPf~oi~kDzGY`I^2w}Mhg|9dTKwe8+OTLfPjlcq4;H?5t5a{r-rQWY_x*m@ zm@*!pN^+<{WwH?PJq9;l*_R=fG+Cw9=E;yImAA6q8)sd^51;c(+N3~dj&@kbIXE0s z{wevibv0D;9I95*ufelUi9+0eyWL$P_-KU!+m_!VZm-{_xfpol<%>QVn%TxBxEFE{ zd>wo=H4pL=_&mW&9`RT2?ct^Tymj?d);v!O^A$9Ct^N!l^}1fZ@x=WNS7t&8vNDMm%s$)Lu;DF zQQKWnKPRWc(p$H>)Q!2@VP>Pi`c-XJcvrRW?xj>I*c8pjTS<}DRtile9eLuQN0Zxv zSajvuY^*W;%al7mlw}87(0+F6L1CqDDdmHNESuzeu@P(%4~TNR^Ir+jS$Q@2Qsru| z5w_Kw8Ba}L&0kLX1gaE9iw9poq=glDDpkBYMdkpergT%=7?2?v1lP5LgXg>k0~ral z=p%0v?|?2qdGE%&Kr6utq)lmk9hZ_wS+CNChp6vwu(6SO`MPiVziRi-;frO9MTU)01sbJgv6-+SgjbqLKq?$2(MQ4;TT%g zqSDf)bJ_)_glq5<7Ff* z3Z)ULb}{AgIlySAe@}%ufcp86-6-eo7s5@7K~hV@6-9Cgc^%IDzPqqxjXa^$=++P7 zU|m|i6l_OnoEdvq#l29$De!90vK70HBWI&vTm=qAHeM5>sD8O<=JzJK#i#l8-Ttdo zN+ktZ38>48FPk*eY9TR4P`5<``R^nmd7KCo8X+FZ;qMqHU~QQW@|UYLgkB#PkQ%wF zwM}z1`Nea^=b$eJEmI&(+s~l8+Wv4uni@ewi_{IkP!E+-I;%}jb~0n&!8(l;B3mf> zy(T1=9$LJX^hOZ|kOLiEPIhP0@nJ_m3O*|s=TMQ*+KWH$y}C_HmW7|rGK6)&(qYA5 z2#FKh4H7|V}w^a!H>+tfS zpXILfDN!SHIQ-^&)Jbm#dlJ34;Surit!Fwt^8(h!rLsd4KYFH9;)IJkoa7GbHN)NA>38i@BMeke!7aKd?G5SP+{2rb#eY>m?cN5pB}wc<7aEBBc|7fr^lQ`B!ZzCc_3NQi}C^u1KiyvUHsIy(w^!{0CED*e8H3=2n=;ndfl1LXXSG+JOL8FkdW zODC7^FP9=L^W5XHo<9+i22&Wqk99*)V2P*=@#J!GW!<{jBXYziglJhrkaFwWPLU7r zo$b&gg4`f1wpM6t%6?iVH#%%@}n?8e!@zfoN- zB^1$8b+Reu%PM3eJa6U%T^aE zX!{)EWHU^cx+r41+FpMvRiT2YleEU_^A+EFjo0VX)VC1}dh!P2e0=`1_q29D^xt3} zj%T&y^fUX+upLs#N6C7dZ=Y56tw~`oOPzpcl;$un+UOORooo)m#aCF;Rct*9etv;A z`VT25rzhw0itMvVZxvd}A>*ruuHXdX6G+*22jfqDWUV})?sS#^1Pak`Y3oDzje|t| zRmU~O`j;S9Z5r9iy!yd#W3;3{qnU=+62)!=hxYq!j$-UwO$ zhS{kFVwy&3nDdCs+H47*nr|SW$e>ON&zUF?UT_kaQ^jSO^f2zHP|U7MP1__DkfIFU zqLGURkRCsz+QeAC6y!-E_mN3#NMy+>DlR}P_o*?v*pgCGyQlm;zWyEhD3<)NV$J-t zlr~oi&iI-+QbeL_BTbP5Wh-*;qT0K-^E#fWb6f+E#w6Z}%v3s}@(r*?YR~e%zI{iI zMhBlY{96vrzWiH8>!@j$P|67}*2B+(q7%uaMvoSGZ%o? z(X$BrcxNCo>RIcD{4G@n!8)`RkjiWVkry>IzR=v?#MRHY2$pV0TF5*`gr>0&^eVIj zPAI-%^QEhuievCf*0Hyhg@lISX)B+u*4+qm)F+D~GDOdj0N)3N^hb=igi^=B$wJwB zhH;>-tVyDRku(JvfDF$KfXiNrvHm1WE+7e>T|9#azRI9_^?XDqy9EsMcGANao*Ihj zJ(wMlM%Jht{`#g|*T9QtwuSQjAgJm0uotd8>sLfb6Qttuil~vFmOjt_Hs-C_cg`?w{ z^&VK`O!e@-DW-$B1V1;s%;x9w@s^rhB=3O%Co-?JL=V7u4XD+JOn{AA?+~&xz2(bY z$E&1Za!?=-?yJ=)D<0mUf6?$(=wfjE!IX}V8;+74R=dA4spU?@HMY4&hlep!g`5}| zf4MsIE1Ed_FJQgEhq-?!r$0xyP&)udF_ROkp8kIaw%7zl%kQ={JebtS20+*0532mh znV6amo&HMrDQfc}Gt9dt{o;Vg(9+j5K(vC@vH;OaRkOOCK}5Y3a0@GW{p-H%j_sd1 zC0*)>P<5MOkI5Z^KOz>O;05L@8|w3N26zv9i+3$t7EZA+U%>Na``Q&UC8ueC|J#*& zMQ^I;0RGQYEm%_SV!2@Ezz)EWn(}s$KXI0_VY8jqly+YF%kH#mAFB{1Bb2 z1En%Up!z%?8?G;53B+}`-QQ1_-7K|e`>ww~W4cE|FM7Xd*JOonZa7CP>&S3o-6MW} z4r>>PCEl?cj%q$hEA#rsM zbsHJwLKi*Ndn%d_dK$Icq(ooUcbax8JJoHTgWUVxTlD8hxj-k540 zlz!N+WaK{@(Lw&?nhvyIqYW4wz3-0*BK(08DfC5XsdQL;u8&2m86Tb2KM3f>Vs)&b zCWM8-tRnp;bI}F4e6(Cp(A!?W`Ez#$Dz%+;7X4GemsKfTn^wpsn_e_@hs6-V*ww=M z5I-=8(X%|f51tGH7Ribf$J@EwV<6AxfmqZK57MrHysTKWO$9^xFt9ND`z0fOrl|*K+vz;vAMlpI01W6scQk{oMF5|irO4RZrTRIwE&5?+mn>hhs z!q-3jP&iLDBbc40meM`3*%8l*RBoFQG&u&`8gF@)jZwPX4UOqq2u^WOVc2FX3v{kC zuWVdYcu(Fm-l$`WL#!<|bju_1IN2fRu358thCyGL=d7TUN%-E|jlj#R;YTk*wTesJ z8Tl<1+sk7oRCYs4=Z^#ircdc@Gs3I)iz4UGvV{i}Yp_HADer#x&&(wc9ejqUFHutK zgP44^JZv4eF?ljo zt!XYEvTS022eaL@}WHFSS2{5a;v(!)H)KWB`!dVV+PDahG~iUPnAyJ zfp}C3KX)H$=KhfN|;os%7@*wVEE)xuqQ^YD?dE zTtjI8mz_8xv;T7{ZK#F|NEVP5J~-UrtwV&`mje+O-dkvo(2)a}Ms{ncQs13LENFQ$ zI?#8eju*!rHXH(lhR6~F=^fl>)$3GGk64;-kAuW0NRQ*W{C;nJnZ;bb$8)@};JO?X zqkbrNGV5U93?KfG3I*(m`mVjRT=|9=j2dPSU3XuKkQ43?V=-F;89Na?5;RFrfum%w zc;g&}?ycy_TS3|Q5N~nekN_ZBY19%LF4UeP_mHh(!I7z(`A!Uy5b$>7O&lINbGlhB zhhW3WQ3M@d{WkKI8RT7|P@pI?5f1JT9T$3id~wu~p@*SpAM!o_hhXn(5FQP$xkoz| zhGo%LukVsvZlvq5vdnNAHn9Y_F$#2F;>oI zZ^xeH*}ohy*Nc2UmVhXIBS$ev}lyt0~vck7NiFTzUBPTGptYn)T+Epx?DDLxc$@&|!!K z2pW-13yue2^sVm}ki!!6g@?7*zEiU3{+Y2fpu42FKs!PXst2KV85k$n5VOKp$YZ7t zGCO|i-wyo~`B`b2wwq~KnEHZ`0d1#Q4D=_K*SkWsbNRO*)+#^=usi&O4o_6+%`nP^ zMXs27N_zI%<+ef7pp+Tpb^p7h2@umLHl^cs`<1exOJ?>`QA%yEzAQfoAVWlKv4%zM z#h5V-FYpdGcX6SDxKR%BF9)%Dhf|IEW7av9YLb~NK`#i{n;&& z0f!nqq@lk)&FZ@wkIhC-A4GwDhNjVmGiu<{HN(AKpHvtITC*7cVnd~lPg%xI-~HZ0 z@+r2LsDfQHA2hq?6$^)%f7lzy4{I{-3KC1VA;$DH%%+TTB!kk(_M0q7sEFV@gnYWJ zevEB~ZKw$aJMsftkQWs=t9*MRWBmS(>1>)^I%&~b7~nfVm2!eoHo?ke+RX5Ws*f2F z0-9S}$B~s7lFN{9)n0&Qh5Q&zbTONiz~+@fzyZPoa@?q*xOOu&d>94stxkqT4bokF zyjQtZ5DB_z<%V{cKu<~`OFela*;m)gW?G8lCm&Pb*FOD@Ekb|eAh8f1keI;wc%Ew0 z5Z}QeP@)5-V)UsbS&TvPGqy%dsN+j~{w=Z#@?HwF8Jsowqc0+sD^fg}ZXh|PoDA1v zM&7cfpc#Uam*sLH8Th0EL{sv0^3kF5P38ns8$pA}W0UQO%;I)X7aIQ!A3wPZ?@F;# ziN990`HD(iqfQrENAZrSC*!aE;+TCR6LhsvK76{S{!~11->~B@HtN7eMSbr?`-}7c z>GAy*UzRgRHO=~Kw#@=a$PbZ%>`-&4i7VNqwI8HBKbRYRA}k+JJdqFjO0@X0VqCw= z!?nkYGa0IWD9%2>dGu)@F0*u(%$Q?9xPK0ixJVw2b4y;2=4GKC?Y+zU@6B({NPG3J z4JL5g9ZJSJH?-GNDJ8h2)gc!oVKCY7fuUkxXJCmpi=*&0^Uc8c@=!Pv(%yX8JZ<=}8tLcfF_l{=NxL+cr z4c~AZcTaU^WXVtU4E5mQ-8bnte5fm&?1w*7$?;c)s{w5lybANGpq+Z@LpGs$(7GVg z_&HROCAm#o;osA9>*;OM#~fw(hTKS-Xi{-%4`Y*?scI*u$ostxakBqqt6UM5>+o*g z>=ITR! z68~wEX}W9na;_x+D!yY=DP|WzImZP9+nfJ&Y&iw^31K$`OG~A>*3)}0EJU6Ac(4>J zD4^-sZ=T!!lJy_W!{}Hcp~2M7)BjMcLXI|-CUIhWu2pgu$Ra&~5|{_Wghwu1lYnZ= zW^vH9N(Q?wUxa}U7@q#0H4w&FIJCu^3mTAX&2ZQdObeLG&89Itwg{KpUjWoZ6`b{X zcE=j`0xP82O{~$_8Rq6iRb<+6=Xmw;_eC;M#}Ao#84{wBn{Iy;6&E^&t~Fr9w+6kJ z

Z|6~FUQ*{LdaMH(+^PrddN)?6T_SYn2(>|g*}fTc9_CMQ5HVc1`@!|73#JlyW) z?OJkfC*^p8)!zV&^UNlqMqD=u(1r%J`j3HPVnrIp=RbLXl}Ff}_bSQdw9`&V8HF+0 zxailpzOKxl_d1!zS|=^9PXeo@zHhH8)}qn7Uw(HP9d_c$EoYV7lNRTmHyl=($d4=K zUr%^icQC2xv@jStt$cwguVpngv#-qa=;xp9TBJO|JIB<&3+TPa3d4^x^j-MZMu|74 zS|*_Pr4lMz%)HTRCsf2`>V4M|2`RZE7HNRTt>AFa`BC~2+AA>u!N7Wzm)6Q2KNPC~ zS%p|k$uH?cTlLY){3SeMJw`_%IZJtr?%Ht|lqli}DO|obTUA#P#r})DClhUypnmfN zNKKZv4Bhwj)u;TSET5bB*4pbA%M7K&;xZM6Hba56lB37&Hv(H)cJCbX z4oDD&W~*x*no;*rr=6FtW38Cbcb9bbXq{M2LlL9zdubPsi|VrF^yMTL3#jCd@c3@7 zW+X*-t<$xqz86(AI8f2ez=%YlzZyk{@VzaZelu*?Iiz2(CBE8VDdr&yy4E4Fq%fqf zG%?4&Rjz5&#`IR1I{gr{Ih8Cf$@dTDpu5i1-GNb&mXmc(S;SD}oyxgJgnKyun=$O1 z1V(MI_g1iV$w6-yg@QZ0TC!$GcNe%@DL%~XGd&WH>HKuDOeT%5= zXqnM4o7I3c;4m?batVPXM13#HN2Nera)j0h(6N=9AGV=&>nR7XVcw=w@}d;;!MU3@ zotu@Mk9kvHN+ne5U`g*j4W8upz2(SG=KEHJ^D5hIr|+(##Req&3C%PovVn2x4a1%? zb~c{Qm>*p!DK*;ryiScwjf}fUWft365y(ut6Qom=Sp1Z+o;ho2lW1uVs|c~^wUatp zWZ$(O?i7euLQeBPjwlSZ)a!`QkD!ukR1Ia8a#J8(v|egJEL>^kY$TmQji}LAu%%k` z`5e0vqg~Z>i<2hgF4-r~teaXys}_TTM=7&ziZu+C*|yk#7Vog?1p2_=`&06$ zQMh{g=VmiC~c?5e%k@x zC<{p0!7MmND*Ljcp1?vZ>&kTe%%It)Uf##N{&+xm3TJ(=_3UQ2$7&TL4`2tv#HB(M z^H*=9irzmknnE@!;y`3CH;CAh!OX_2csnKbXBVyjoj~ze#Ctj%Z;LB4QE5S&MB(#$ z#?T4<_w9{6N*vLt7347exHLcA^cVQZY_|~__Bk>P)4MIIxxoJ8?di*QaETUmML9X6i4I)>B^oN8W0NW{JW! zt6)o~@u2ZQBa5r}-dYBOhNM%A2AjNim$W=f<<{uOno#=!LI1x)vV;o8)k#6j6gb)9 z%S1Pek`0`x)d!9nR6SSe${nu%)buCYpMl0M+9Ctsuy62ss}2@mLJoAlsk~7)t}>@C zR5|$h$OLF1lN|(zMoytkm}lR(#y|9H^q}<{1{MA9bWY9onm79|D2&WOMjOOi zeJLFrxgrqn;q?hwwsaAECPY5986EdhKd5P-sd~9A)5egHr7IoZ^cigB#G6XY80QNk z_}lt~l7p#QNwbQGb#0msX8B&xFgD(rX&4Q~2gC{{lU76Bvy0Iox=KZH=g$6>0TPpK z!aoLjg11rAWSeaD6>l9sWzr9TC=ecRFKw?+DnZrt4Az)qODW!|lztmzDLrzkOXJ{e zEF6up(7cRodg6V+^R(R=n0v&-`P$Ehr$-y@ew|pXhd_FEs&8f`ZpM$v7x}A%AglTb z*M8H-`e4iJyDi2EmPDyx)~eQtcxniLN71!l$zW&m5Dx8LI$N4!hrGuW{A#Ouod-;& zz;l_g01x}xgqG#+A9IXGaR0chk;5e9m3!Iyp%&0@(DK|k4pu_dFz8e@+I20MoU_f; zK<&)YixWslwEMZ2$m)3svrwIamW?@TPz}3Za}K(s3N-T2rhjNL#%EkdPwt>N@a&c3 zSqp+&XXFxDaFpGQuXlX~9EL#~X*Hc7*VS98skTZ|%^X>E0UT+z%!cjj#;FE9rNA(w z^5lFqN!`-G;!V)aZdpR!>J9Kw48F=Qw1_=H!-duaL3%{6iB@ev@dop0LbvN`b=nCG*RyG>S5(9MRF_`8hX0r4s}xD~=oh!n!?WrMm!{*Yb(dPX#3g>6y=J-((eUXv{766}*9~$2(+X{inhn|% zrRJREkOQeMyo*2wc--jN;A-G~BsFb5mB5X1y%K;wUg=LJL))Ia74Cxj$)1k%X8VGJ zfiAS9B!_z=ow(>lg{}RBl4)!aoaI}+X11l?C`k#~6#0HgW5NDGaE5UMa5!s=5q{f} zc-3*&YxH}yCmU(4IT!0B`*Vxj>G2f&S$1wzywbtvRi7r@O;;l01ZtDP5<)tYKA~{F zxg8%y+-YdRVeRJ^Ab}&8Du7F0+h+Dq7Rq;a=qV+CeTET3Oa^1FMVJ$9Dxc3VRKD?? z$~r<{`L{RXWr35$%L5igH|ADWX`=fZ+A79scH!7wS~Kf@zn$9eM6 z6UV{*$FYv0k`zapwd$7rmF$CcC& z(buli{bZcvO|s6)XUdrts@d&LB}s5E7qj$=yoFEe=T8X?%TssEv=FO-j!GV|HM1|J zj%)a9jS3*`5{}*R77(La0gsx54}2Y5Q|kWJABH{0ed@=s_u#7%oK8m2=T`e?hz73V zp^o8ON}Pb_R5KDzm`T#YWOvXQR~%JeY@;9BWkWFb5wIHaPzQ&x;v`HEN;Sn{ zWa|n%joy`j+Jk`k$Gj=O>?M0!uHUr9zs+A(-o@r z*9%;&3|A~EC0j_o=P^2(SBCg*+<6k+C}7!i#Ky8*c{E9e zBhOR#cxe#hJpYo7?#LBS7kgl#{2JgjjNOW5wPIC;ve_jCKpS^mE& zwTLjI89eTSkq8PFPP|pFna?i6y=->7a$zNIc4tZ;HU$v?RWt7)*LCBZ?yp=wJd0m5 zK&XX%UEO<}86&WS3RVatUSH1{d*gR|4a4n_+hJTO9~x^|hv6o*POIhr(Zlt4|LX(_ zaNu80(Eq`=|9czJE#+%jM&qx4fc+L9+fC#hGwOl>n?c9$Y+-_!_N�y-R~i7k!Ar z`+%scc5VHL-O_So{z(xhqga%g<`m)AxYCV8P-r>~b;JVNdK|!ULPZz%Cb|zy+ZIR{k&X_`iI}e_2l* zmeCvwe7)G7v!BG>!O{#v5#qf%>OiP|V{gzT;xuLYg2yOgtWf@n0JOolo?T4_C=E5y z)7K}uT(VO41bg_FT5?#5`<9Bjys7Y?Hz$FnVOxc55mp;@k(BrY+y9?rRR1%E1ltdA z3a({oR5)NBtRL+zC!vLeg&A8etpi0^(&{+D$jC@dLG97!b#z!**b}4Nf`T%`s>;eY zX!Yu?{d=XQrEiQp56$Xofdy8j3wU9Y&wTHI%HF=GKf7uHY zbcNp>?f^o(ts-SY>~Ei@`hJX#>aPQ~BG1^zBrR68dmrS+_j!zJ-PLibo0^)&x4WdI z3b#6w#0CH{ut+cQrg;CS@I>*E@5iFB6)(d>HwdHGiG5u@@Fv@Bmw-=7dl5l24Z8nr zMW}!}I>E?bCagu$VOH6zBc8XxpDfj~8zueGUfh4!a0i)z}x zDVv4eMRkC&*J8N6A7(39IyT6-2<)H2c&L(JPX~M6Gm6t7WcReP&>aH8oUwCpyhASC zRC*v{^YkL1_+c#BX~E^yW@g`+r&qSdWs&+2M>w{_zS^)~L5R>^&Rdr-3smiu8X4B+ z2R^)Ed%n>?vVQGh#Vcp$(iA>c%Q4j{av4YN$XDalLtTcBS7%DPrER9zkNngfszzdO zYI=75^ULex`>s=Lr2X_kR*L8FF%YF4*@k6=9(acimHSAnHVNY^0-gmHN`hYA4Bpe^ z-F=_XuA9ttKc0Ojsg$2`W;@CXmDjgow z6m|aOUMf0ea<`R&4Q^9`JM;oU2L4xVjQ6WhYjHb?L2PXBp{R3pVUy{5sB`&_RsPb5 z_U>UhI&&Xhd#++a8+Mvs=rx=NXVH1HS~B4I)s&Zz9p|xcYeNvp{uRS8*|8fp5CIPLu$enmNP4zl_ix?EwYqg_(xCp=T~lj^#- zF$St%u!XH>B|Xla5w~4We*UTm2nZPe)+Je$RUiB@t}>gQu+^Ub{=rqRNmfZ#J>{e* zz7R=sLsv{ptu(QZ?D=6LVzV8*AoU703?j~__}o2bTojc+WhxdC(E z(;$5rbf+*uVf0qG!RN-g72I%x99)FC2PL}n7pi0D&PZ$F`GfUy1Q;im<-W(ntza&N z8~q~aW|7mzTiDf zGvbaeJL9Xg5tD}2WwQ(Me_Z3b1|?x^mUwbN0g9{jX-yJOl|0fqMVd)_$Ylc@Hq6%4 zxreD4hc6+?=9DRni}@PmxbH9Ab&~gBzO!{wk7wGOTIYJkoW8LBHRBB*<4jf5Hzf1M z%c({|Qucy9BrczrZD$ubH{_wZA zWnEhhi6`;()sw}CNN^hP$1~fCzCDAIQr#dz^Go__@e`14F#p1jC-AMvACL5T9*uCI z_*9D*;2rNRWuZF!(GaKO$dGmEItHnMvadSgygMpm-V}-zM{Bs+q3V5_r!W{BF`qDx<`(X zdEWZc^hVBhpW;OIKbdDWyfFdigM)R#d&2L5hM(ZA(K+H0<R?voU@(siiLb_x|i7JA9wg42zSWPKCMF-TgfV`(MC|lvq$~ zCE5kHcyhiLCue|(CcX(n-u1OAg6os1E*HUNvzZ#b5pv{(9Z`EFVJ667A+M+anexeH zh^xWAUszEqK1RvQc*sI8(O864mc+iP`p1Hnsg(7nCF|Z?!@~f>jE$D<6nz?v1c$q2><2>>_ zWcSXZ^R)^6o@jJ_*d!(YU0by|SO^oU1zd|Bl3jN45SKGr%GDl?l!7%|VTXj4MVI0b z=RLgcXc@Pye=c#^wqJH#8AlgM;654UC8y4MX34T(yl9kY+0uT-5h5A6NqbfP2{-oyM$~f^b5rB( zId8-gC^shF>a_~e=Xb~++?JmzIk95s21RG8+AjmF{3s}(3=*uO2LyKBqMdH*-(%q zR}q>sSyW#FQIS0|v3!6{9{;pcb#pp7tz119)O9BuMmAB%a`Qji^d891>*Kt;mhkq- zI4yjG+M`Q0(%O=2fN7BQC7;x51?}EI4gY`UOMezX#18oFC}d(o&cYE9WT&kYVfk(- zV??C*fF&9hKsKPM-lsPAZVQ{~1mRExUOgusLgUp6o#MTA-eFVMtxm_yWtUD|c_B_a zs7$12j1Uk0R6|MOGK~xeM(Lx(L#V>WaEU3evu(TWyvKN;+pqeZpQ*O^DV1C1H<*bklc$=X z&7JVVFNP~0!7afZ^)%B*9m7kqRUJ^{AL5~drKD53Qw3UIE=pq$n1*(3&;^6 zDyqHi9v&%$3VZGXS2AB4k%mUsV!yq6&J9BPNDE!e*?W}``|n;(^r(nSR#jPdma;;J zCpNi0mfGm{CA2rj;fKrYi#H*{%WIc)<@CGk`D2_;`$)I_24uSIU%CZqhx=uXk4V8XTZ^AR@e#zx2)@OP(# zs-q62b_Gl=>}mHc#hiv{Tb^O^`b?-zYkHIa1ic_S%V9#uZggzr*)tSXX+a+P@p}}x zL?#)xvbRHKesm&pMAQDNMM0b_8k@ms&Tw5P8NDSJ`=|&PF8ptp3&u6}fdzg}{jJ=> zeg}=ghSk_r;CO$VhUH@Zb9C&};J*vGu{!~)Rd#g;2#yH{4N zgF_GSjmprw^Jfa!T^~OdH|V>smgp@pQ>XpE;Ks#O25@6I!FI)nX5`h%e%L824uB!B zoJ7VnmuA-1AxW@Xk)k=CXg-s5`Lj{+c4E%D01WLEaPeDNEXwIa|6e9% z?BImp0=@kI`w``)P%t3YH<;5LOM7f@g2&)1)OEjfaPUX030`|)!q&d5iINfrqW0jD z-b*ZHin%u?Pids=9h4}snQgs#hz1!hq@_T|(l^Vf)nsC?FU~A;!-N7W^;G!{2O_2yv z5^FPp2}Ocw_2}Hb386ih3HTVnA8s|i`C@O`K|HqZXPlc4Pu$Vdwh6ot9lg8rTK(n& zniT|uSs4yz>Ui8gM*`p+Btbn8m9WXpHsTdyXTPlhK}jE)-Q4A$R?P2n)`a$aX- z`10*5@7hf`2lpUTjcm*~4u3VGDYEvIN$e^ALv~I2cFcALV&pL9<&kHzWDCv pimyXG$!R6)AdNBmWA6NN>P92g@PlJR@Gk62Dhe?9eA(x|{|8Ado$>$x diff --git a/docs/finn/index.rst b/docs/finn/index.rst index c13bf81cec..ab9cc96fb1 100644 --- a/docs/finn/index.rst +++ b/docs/finn/index.rst @@ -5,21 +5,21 @@ FINN Welcome to the FINN Read the Docs website! What is FINN? -============= +============== .. image:: img/finn-stack.png - :scale: 40% + :scale: 15% :align: center 'FINN' is colloquially used to refer to two separate but highly related things: -* The FINN **project**, which is an experimental framework from Xilinx Research Labs - to explore deep neural network inference on FPGAs. It specifically targets - quantized neural networks (QNNs), with emphasis on generating dataflow-style +* The FINN **project**, which is an experimental framework from AMD Research and + Advanced Development (RAD) to explore deep neural network inference on FPGAs. + It specifically targets quantized neural networks (QNNs), with emphasis on generating dataflow-style architectures customized for each network. The key components are illustrated in the figure above; including tools for training quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib - Vivado HLS library of FPGA components for QNNs. + Vitis HLS library of FPGA components for QNNs. Read more on the `FINN project homepage `_. * The FINN **compiler**, which this Read the Docs website is the documentation for. diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst index 6fea992cf7..5b1d59b99d 100644 --- a/docs/finn/nw_prep.rst +++ b/docs/finn/nw_prep.rst @@ -32,19 +32,28 @@ The idea behind streamlining is to eliminate floating point operations in a mode After this transformation the ONNX model is streamlined and contains now custom nodes in addition to the standard nodes. At this point we can use the :ref:`verification` to simulate the model using Python and in the next step some of the nodes can be converted into HLS layers that correspond to finn_hlslib functions. -Convert to HLS Layers +Convert to HW Layers ===================== -In this step standard or custom layers are converted to HLS layers. HLS layers are layers that directly correspond to a finn-hlslib function call. For example pairs of binary XNORPopcountMatMul and MultiThreshold layers are converted to MatrixVectorActivation layers. The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MatrixVectorActivation layer can be implemented in three different modes, *const*, *decoupled* (see chapter :ref:`mem_mode`) and *external*. +In this step standard or custom layers are converted to HW layers. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. These layers are abstraction layers that do not directly correspond to an HLS or Verilog implementation but they will be converted in either one later in the flow. + +The result is a model consisting of a mixture of HW and non-HW layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hw_layers`. Dataflow Partitioning ===================== -In the next step the graph is split and the part consisting of HLS layers is further processed in the FINN flow. The parent graph containing the non-HLS layers remains. The PE and SIMD are set to 1 by default, so the result is a network of only HLS layers with maximum folding. The model can be verified using the *cppsim* simulation. It is a simulation using C++ and is described in more detail in chapter :ref:`verification`. +In the next step the graph is split and the part consisting of HW layers is further processed in the FINN flow. The parent graph containing the non-HW layers remains. + +Specialize Layers +===================== + +The network is converted to HW abstraction layers and we have excluded the non-HW layers to continue with the processing of the model. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the SpecializeLayers transformation. It is possible to let the FINN flow know a preference for the implementation style {"hls", "rtl"} and depending on the layer type this wish will be fulfilled or it will be set to a reasonable default. Folding ========= +The PE and SIMD are set to 1 by default, so the result is a network of only HLS/RTL layers with maximum folding. The HLS layers of the model can be verified using the *cppsim* simulation. It is a simulation using C++ and is described in more detail in chapter :ref:`verification`. + To adjust the folding, the values for PE and SIMD can be increased to achieve also an increase in the performance. The result can be verified using the same simulation flow as for the network with maximum folding (*cppsim* using C++), for details please have a look at chapter :ref:`verification`. -The result is a network of HLS layers with desired folding and it can be passed to :ref:`hw_build`. +The result is a network of HLS/RTL layers with desired folding and it can be passed to :ref:`hw_build`. diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst index b8a7f0d9e9..346eddb073 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -21,6 +21,14 @@ finn.custom\_op.fpgadataflow.fmpadding\_rtl :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.matrixvectoractivation\_rtl +--------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl + :members: + :undoc-members: + :show-inheritance: + finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_rtl --------------------------------------------------------------- @@ -44,3 +52,11 @@ finn.custom\_op.fpgadataflow.thresholding\_rtl :members: :undoc-members: :show-inheritance: + +finn.custom\_op.fpgadataflow.vectorvectoractivation\_rtl +--------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index 1f4c9e495b..8dc7e1afc2 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -15,7 +15,7 @@ Submodules finn.transformation.streamline Transformation Passes -===================== +====================== Base Class ---------- @@ -116,7 +116,7 @@ qonnx.transformation.extract\_conv\_bias :show-inheritance: qonnx.transformation.extract\_quant\_scale\_zeropt ------------------------------------------------- +---------------------------------------------------- .. automodule:: qonnx.transformation.extract_quant_scale_zeropt :members: diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst index 7ac54501cf..39d25c2634 100644 --- a/docs/finn/tutorials.rst +++ b/docs/finn/tutorials.rst @@ -16,7 +16,7 @@ The notebooks in this folder should give a basic insight into FINN, how to get s * This notebook can help you to learn how to create and manipulate a simple ONNX model, also by using FINN -* 1_brevitas_network_import +* 1_brevitas_network_import_via_QONNX * This notebook shows how to import a Brevitas network and prepare it for the FINN flow. @@ -47,6 +47,15 @@ The notebooks in this folder are more developer oriented. They should help you t * Explains the basics of FINN custom ops and how to define a new one. +* 3_folding + + * Describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. + +* 4_advanced_builder_settings + + * Provides a more detailed look into the FINN builder tool and explores different options to customize your FINN design. + + FINN Example FPGA Flow Using MNIST Numerals ============================================ diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index 4b1821aca1..578c941c36 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -5,17 +5,17 @@ Functional Verification *********************** .. image:: ../../notebooks/end2end_example/bnn-pynq/verification.svg - :scale: 70% + :scale: 40% :align: center This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder `_. -When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. A single node can be executed using one or more of the following methods: +When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS/RTL custom nodes. A single node can be executed using one or more of the following methods: Simulation using Python ======================= -This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS custom nodes, so right after the streamlining transformations and before the nodes are converted into HLS layers. +This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS/RTL custom nodes yet, so right after the streamlining transformations and before the nodes are specialized into HLS/RTL layers. Simulation using C++ ==================== @@ -26,7 +26,7 @@ This simulation can be used for a model containing several HLS custom operations Emulation using PyVerilator =========================== -The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. +The emulation using PyVerilator can be used when IP blocks/RTL modules were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. For debugging purposes, it's possible to generate .vcd trace files that show the value of external & internal signals as the emuation is running. To enable this: - for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a file name for the vcd or `default` to use the node name as the filename. From fc6877b4a97122f3bd991356d98cb249027779a2 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Fri, 22 Mar 2024 17:30:06 +0000 Subject: [PATCH 275/291] [Thresholding RTL] Prepend dummy threshold for narrow range quantization --- .../fpgadataflow/rtl/thresholding_rtl.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index 3cbb2ba427..67b41d0165 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -168,9 +168,21 @@ def prepare_codegen_rtl_values(self, model): code_gen_dict = {} # TODO check for sortedness and size here? - # RTL component currently always expects 2^N-1 thresholds, but - # sometimes we have fewer due to e.g. narrow range quantization thresholds = model.get_initializer(self.onnx_node.input[1]) + bias = self.get_nodeattr("ActVal") # activation bias value + output_data_type = self.get_nodeattr("outputDataType") # output precision + input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision + o_bitwidth = DataType[output_data_type].bitwidth() + + # The RTL expects 2^N-1 thresholds, but narrow range quantization will result in + # one less threshold, prepending a dummy threshold and reducing bias by 1 to compensate. + expected_thresholds = 2**o_bitwidth - 1 + n_thres_steps = self.get_nodeattr("numSteps") + if expected_thresholds != n_thres_steps and DataType[input_data_type].signed() is not True: + min_val = np.amin(thresholds, axis=1) + thresholds = np.insert(thresholds, 0, min_val, axis=1) + bias = bias - 1 + # add dummy dimension as final dimension (that's what gets packed with next call) thresholds = np.expand_dims(thresholds, axis=-1) wdt = self.get_weight_datatype() @@ -184,12 +196,9 @@ def prepare_codegen_rtl_values(self, model): t_path = self.get_nodeattr("code_gen_dir_ipgen") pe = self.get_nodeattr("PE") - output_data_type = self.get_nodeattr("outputDataType") # output precision - o_bitwidth = DataType[output_data_type].bitwidth() num_channels = self.get_nodeattr("NumChannels") # number of channels # If a single threshold value is found, broadcast the value - n_thres_steps = self.get_nodeattr("numSteps") expected_shape = (num_channels, n_thres_steps) if t_packed.shape == (1, 1): t_packed = np.broadcast_to(t_packed, expected_shape) @@ -223,8 +232,6 @@ def prepare_codegen_rtl_values(self, model): code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] # Identify the module variables - input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision - bias = self.get_nodeattr("ActVal") # activation bias value i_bitwidth = DataType[input_data_type].bitwidth() code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string From 85baad01cfb57cebc6a3b1109814651cdd422cb7 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 09:51:09 +0000 Subject: [PATCH 276/291] [Test] Apply parallelism independent if it is HLS or RTL variant --- tests/fpgadataflow/test_depthwise_convolution.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 6ad8618981..bde5e918e3 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -56,7 +56,6 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -from finn.util.fpgadataflow import is_fpgadataflow_node def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): @@ -182,21 +181,16 @@ def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - # for cppsim set all layers to preferred impl style = "hls" - for node in new_model.graph.node: - if is_fpgadataflow_node(node): - inst = getCustomOp(node) - inst.set_nodeattr("preferred_impl_style", "hls") new_model = new_model.transform(SpecializeLayers()) # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator_hls": + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation_hls": + elif n.op_type.startswith("VectorVectorActivation"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) new_model = new_model.transform(SetExecMode("cppsim")) @@ -234,13 +228,14 @@ def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): new_model = new_model.transform(InferVectorVectorActivation()) new_model = new_model.transform(SpecializeLayers()) + # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator_rtl": + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation_hls": + elif n.op_type.startswith("VectorVectorActivation"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) From 7b50f168a6b55c4ffe84fc1793ec1b77d3d903e1 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 11:25:35 +0000 Subject: [PATCH 277/291] [Docs] Update manually written docs --- docs/finn/developers.rst | 31 +++++++--------- docs/finn/faq.rst | 21 +---------- docs/finn/img/repo-structure.png | Bin 83241 -> 83069 bytes docs/finn/internals.rst | 60 +++++++++++++------------------ 4 files changed, 37 insertions(+), 75 deletions(-) diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index 1e1c48e2b5..3b182b8db8 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -10,7 +10,7 @@ Power users may also find this information useful. Prerequisites ================ -Before starting to do development on FINN it's a good idea to start +Before starting to do development on FINN it is a good idea to start with understanding the basics as a user. Going through all of the :ref:`tutorials` is strongly recommended if you haven't already done so. Additionally, please review the documentation available on :ref:`internals`. @@ -61,7 +61,7 @@ further detailed below: Docker images =============== -If you want to add new dependencies (packages, repos) to FINN it's +If you want to add new dependencies (packages, repos) to FINN it is important to understand how we handle this in Docker. The finn.dev image is built and launched as follows: @@ -70,7 +70,7 @@ The finn.dev image is built and launched as follows: 2. run-docker.sh launches the build of the Docker image with `docker build` (unless ``FINN_DOCKER_PREBUILT=1``). Docker image is built from docker/Dockerfile.finn using the following steps: - * Base: PyTorch dev image + * Base: Ubuntu 22.04 LTS image * Set up apt dependencies: apt-get install a few packages for verilator and * Set up pip dependencies: Python packages FINN depends on are listed in requirements.txt, which is copied into the container and pip-installed. Some additional packages (such as Jupyter and Netron) are also installed. * Install XRT deps, if needed: For Vitis builds we need to install the extra dependencies for XRT. This is only triggered if the image is built with the INSTALL_XRT_DEPS=1 argument. @@ -84,9 +84,9 @@ The finn.dev image is built and launched as follows: 4. Entrypoint script (docker/finn_entrypoint.sh) upon launching container performs the following: - * Source Vivado settings64.sh from specified path to make vivado and vivado_hls available. - * Download PYNQ board files into the finn root directory, unless they already exist. - * Source Vitits settings64.sh if Vitis is mounted. + * Source Vivado settings64.sh from specified path to make vivado and vitis_hls available. + * Download board files into the finn root directory, unless they already exist or ``FINN_SKIP_BOARD_FILES=1``. + * Source Vitis settings64.sh if Vitis is mounted. 5. Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options. @@ -106,7 +106,7 @@ Linting We use a pre-commit hook to auto-format Python code and check for issues. See https://pre-commit.com/ for installation. Once you have pre-commit, you can install the hooks into your local clone of the FINN repo. -It's recommended to do this **on the host** and not inside the Docker container: +It is recommended to do this **on the host** and not inside the Docker container: :: @@ -119,7 +119,7 @@ you may have to fix it manually, then run `git commit` once again. The checks are configured in .pre-commit-config.yaml under the repo root. Testing -======= +======== Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/main/tests. These tests can be roughly grouped into three categories: @@ -132,7 +132,7 @@ These tests can be roughly grouped into three categories: Additionally, qonnx, brevitas and finn-hlslib also include their own test suites. The full FINN compiler test suite -(which will take several hours to run and require a PYNQ board) can be executed +(which will take several hours to run) can be executed by: :: @@ -146,7 +146,7 @@ requiring Vivado or as slow-running tests: bash ./run-docker.sh quicktest -When developing a new feature it's useful to be able to run just a single test, +When developing a new feature it is useful to be able to run just a single test, or a group of tests that e.g. share the same prefix. You can do this inside the Docker container from the FINN root directory as follows: @@ -178,16 +178,9 @@ FINN provides two types of documentation: * manually written documentation, like this page * autogenerated API docs from Sphinx -Everything is built using Sphinx, which is installed into the finn.dev -Docker image. You can build the documentation locally by running the following -inside the container: - -:: - - python setup.py docs +Everything is built using Sphinx. -You can view the generated documentation on build/html/index.html. -The documentation is also built online by readthedocs: +The documentation is built online by readthedocs: * finn.readthedocs.io contains the docs from the master branch * finn-dev.readthedocs.io contains the docs from the dev branch diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index ef4457f53a..70c2f24ed2 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -7,16 +7,6 @@ Frequently Asked Questions Can't find the answer to your question here? Check `FINN GitHub Discussions `_. -Can I install FINN out of the Docker container? - We do not support out of the Docker implementations at the moment. This is due - to the high complexity of the FINN project dependencies. - -Since FINN uses ONNX, can I compile any model from the ONNX Model Zoo to an FPGA accelerator? - The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer - types and quantization annotations. Networks must be first quantized using Brevitas and exported - to FINN-ONNX to be converted to FPGA accelerators. - - Can I install FINN out of the Docker container? We do not support out of the Docker implementations at the moment. This is due to the high complexity of the FINN project dependencies. @@ -52,7 +42,6 @@ What operating systems are supported by FINN? FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long as you install Docker (``docker-ce``) on your machine. - I am getting DocNav and Model_Composer errors when launching the Docker image. We do not mount those particular directories into the Docker container because they are not used. The errors are Vivado related but you can safely ignore them. @@ -74,16 +63,8 @@ How can I target an arbitrary Xilinx FPGA without PYNQ support? Why does FINN-generated architectures need FIFOs between layers? See https://github.com/Xilinx/finn/discussions/383 -How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particular layers? - This is done with the ``resType="dsp"`` attribute on ``MatrixVectorActivation`` and ``Vector_Vector_Activate`` instances. - When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ - folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). - This is a good idea for layers with more weight/input act bits and high PE*SIMD. - See the `MobileNet-v1 build config for ZCU104 in finn-examples `_ for reference. - - How do I tell FINN to utilize a particular type of memory resource in particular layers? - This is done with the ``ram_style`` attribute. Check the particular ``HLSCustomOp`` attribute definition to see + This is done with the ``ram_style`` attribute. Check the particular ``HWCustomOp`` attribute definition to see which modes are supported (`example for MatrixVectorActivation `_). When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png index 704e5e5bdab8d51d88f5a18893153b5c0827f755..05db9d201cd9930942a05a3b8185a613cba0c61b 100644 GIT binary patch literal 83069 zcmd?Qc{r5q|2J-*DU4CdU@)jqVutLyL`Bx_Qi-vLER)1w>}9eh%GhP^PNb|gc7_;R zglr?rP{=YEV|}i=Ki}v69LMjU-|;(s&tK0!*Ws9JTyvf8^E}_@`?b8Enwc1K9~C^x z#KgpX<+8p76B8SRiHYSDm;?AEOV#x(6H_SD6@A_7{x(aQM~H7N8SSF-38_gn?a8;C zM0|?DsZo(N@-CfxXKV9gub8_&GZH^0p1hVeZ^nDtlP~V-etHzmma<qj;P@k?G%j*%-`0HPPc%+ zm|i8~tfT8_>w^!>q9=aFoscZ@mv6hZ8UGsnjgK}c1On43wUa7r@Wknhnx1EpjTt`9VRiX) zh1=P+wTqpMY7Z&8dN|Nq2;-@p(s=lb`tR_frT{}{M(Mg5k_fb&{Ofu%Bnvzr)oQ9?8%hrHogXV{!^g{Vm}eiQ{>;2<{Pu>m^puNDS9O>#%TDzcTcyuxE0tDt- zWE~r;ojD!slgDJDtjto}aF}l;)wV|6eB>8RA!zR&lb+11_F1}{aoUF zf4qC_v0BlJ&9iBr;Hxnh9o#B$aPuOg!F?V_IHHa6T5KnmJ;w5)!@V1BTU%tMSnLfG z50p{Lr?XIH1KH`e$~(++pSt&MC#NpHL{5Y+C&#(3$qdT%?kdw_BTdt)`!lJVcbqTw zQ5>!^Q#*&rUCO97+mg!jJxVKgDinqt)btu3f=atr65o5j^&~OlUjERKOQ&Dyi3w<9 z%jFu6`Z!H^E2wx#Z`I8eW7tbJ;B%n_;jyah7VJDG8d)Vz| z%t64A8~oXX-dNu-%7tK~mw7xRLO=qTb^MRy`gWV`7`=Y4KGUD^5>MFmZw)GkS^ZFQ zQyWCXk-4I;hw!TUm%VHZ3BeL=CDi5<H@oCX%+zkRA=SLX9+jzE(E1IR z`1X4_2@*kJTT=pC-HVl&P^Q_4{phl3!PnIq&Ke7~ff$h9v38cq|! zt}gSX%*{EFS$j6@xk8UHZ>rGCZAsn1%aB;v8+2v zo3W~2(pe*p=7pP6>&RuMcg56NtfjXL$6zj#ev(S@ZhC=RF7Q6n@H}!CGrh2; z@AgWI*Y_M^!v%={la*Pg9>Wg9ySNGBax>Ig!wVG4ONPDYBi*L?!O`$%9{L8X&W<{` zz3kd6gYIt|bblX9#!(ed^&jU#cCJidTAxL9%g~woeYav=ImW z!Tf}|nNO@ov(p%VtWZJ!3@fnsSFb`#%%dVukdx$gZ=1V@vF}-(UfDi#x_A}maSTy% zarI>~iMfm5rMOpG!$V@GFBwSAFtJKEB6Dob%6R7YV;B4sAX}fiLNS-9tmCeel*Nxz zY8e+pqy${{g+P1GETtGDn_v<#b8)wC$V?DqlwOSz(BHGUr!Z=dqHX2iP)5a~?>Vn_ zn2fX%O;^WH@74Q@7{4B6o1I0+l5{KRqPec z`|aADc$n6%^{vzTUTep!cw7Cdygd?5el_N2Yc-3KqrI=N7 znxz&JA}hOP9&dX`!AS_}Gqe7{6;5mqI;$r=*wKR)Lc@cqMhw5{jVrkI)-7Cg`o_|F zPESq8_{lr&du8{EnqZ*hY*!wbBC`Y;W-zFko3+KxRgu))=5|EfzsOA}nWUS5y;Xb| zs-@fTsvbK@knMYFLeh05qmOEM$^IQd7-eBqge;`nb+DzUriex?@(iW&DF|bJxD};Wx zwn7%Aef(Xl6kZ(F&($*r?dShev9}@&2L)?1Gneny4^6ZjB{46PHC!S^^g!j+#*?-} zpxyL_?end(lho?}@RC=LCzO{tBWEv;J;ASm{Q;rt28Z6W*3xr%zN`ltO zyHKXAY=8{QQG$@t8aiZ+PZ*2}@M@ngqt&x_>bq(7GIALYc?~{EYfr~-coxU8a`8l- z>g8SBC?Fi#ghS7_NdaZhr)-9Txhy#xg$;3q8zYr(oU9BJ0D0GAHT&ZU94>@m#jgA| z36d>3L>q z`#=La2Z;TXgR50nOK4-adWl-ljS5slrVZ_5w%|;tQd{UZl`@)8k*VRG{l0OlDL*iQ zBY82Bc)&}$FIWW{lxz?l%{nj4DktDh5uxw%`-g0*OX1@~$dB=IT);mgGgG#ZjXSE;2pjkxWjnJ(kiNXjI)CnmN2g|f**hwnOG^{^1 zvDpWu7?x;DW|u#K6mJGkrRot_+a2UrGm=v`RzH@qv^q-#U@x2c*<#cFydp-r(i66-%~{nE9C(4#>U)(Bb#es=_vek)MP z_+6nQMVJX}o)uBQ`Zps`P5dI-%e`LZozRBF4ot5h=wwa?c>bPx7AT3;lFgrkESRIc z^gJbOiC12QMPKs_+ww+6rAc?=g=VH#vLZ@lE|HHUe^nMDLF8zo7~h}S^WU`w?Fc94 z>X97C?R!QCoP*~V`+w=&eSq6hUA}~~n1A=7(&Yylg2pr9vL9)a#=$~f!Q4Bny`S8) zOnzso0WlA8^H;?Y!4`gS)XzqhMj5CA2Z76UtYUw5e(*segJ1j)N^Eu4PrrO!>Ss>& zps-=Anj@U-kJK}MMB3n55klmJ#!sEFAM zd3Yb5{UY5$se&2haLKBVD%}u28tyzEdn@Zmz(X zLMm>Esl9Dnn5u?0w9;bTJMJd0Ka-< zO2tK-=w5ViTX2LUqHCQ)NKN+CUALJ~(-Wnxo-%`E2OGHw`3uD{)we$^Q?HfZh_00s zM)@nrD+h)jp#n!~^Yy8QiAH=t{-jlTF~isSji1#A&y#=q{h3XC*ucNj-olcbc;WDl zrL~=`f8h!6k9kXtXVYs$__UyTa7gr1EXCAP#@!R#r(FNyFz_VryV%__%kdY}^U}X6d&1YLe1}GBZ}`;n2?}jrsS=}L;{$aiNU6$5 zkRa3C)Fjxs5V|Md=0NRj+yC^GjD&bzS&PJCy^wOO<1eTCh+OIg+gEHRmD)aFYcD{c zOB9o}Pkh=JLh3z9O_tQ%;6!^(pPA4Ar%WHz^~>`-+r%^BnI8*Fj1#~Y7y9}53;aLt zF$qe(#o4AKl1KsFqPi+Zc5}(u>nMFXQOHLJnE68IjXz6 z_|XCwQH)v#@&}%K3OS;y#v70QK>K{D(FeZ&1he0Q55*VIl8Mmj=K@AgLr~j6`+|18kD)upmP~`ByrdpEPvG61&$cu z{n%LPXMVb!ZD^iY!7E@Ju2Q$Gl2#Zye z5)!WUe7K0DGL;D>pzNN*T_)lpOawqMIbtC5ma_7t&>h`pQ4IZk4ToT_Mil~D%PF$4 zq{(0GVi;Wp&4<$vy|d0t!@bjwzS+HUWum|I&_#B&MI{t?1kW9XH{ZG3MebZAF}HJ6 zVpg%V^)D{S^qP}sIQkSipsAiZfAn|uz`xG(t5HSDw&XIk^-7!Ca{ucUI~Map&is+FaYwt>Y^6ZIXwli3 zGcNF;8moCtjtZP4@b@zSCZq2XQcyru9_or1y!Nf^59iX#^k3DX7^tb^p1%j&+Dzm^ zEF!r*t?0W0`q^mwsuN%8gNMi7+rg3A+Q2X^F0*>IQzl2_Sjk|0^>3_TJvP66k%*iN zkYEQ{E2GQi+LfamFc9HrQz^9&JY5Yj_}=wsJEImT3x9A60z@C@K$w zHKPM`=HjO$jJJu8pM%?xU1@aziSe@F(&OvpFi^qw2X)wekTr0h33ggztU!9J<507i zI3ZzV_kg6!c+h&jTftHdr8<+d*WCPV=mIAaa_g&U!CBhFpPg?Eo;=rhohd8o^|{{khVf%pf&B0igZRB$QQ^Q0 zCdEjYZ0lbfBznHQwL?um+{D?|trwDZN&}ZEk4o&l*Xlsoo`cIEi0VfVN(EPE->63` zGVI=|uW9BFZ3L_DEdH`%FV}7g`lbD2oaiN{uhfNaR4yQBw_T&Mcpym3mdBwb1l}bF z5zTMhY%B~Mb;V$6$5vjMuNb87FCA?JO(4$~l|-dCkQ3xU-m=ZYiP0UUuj5!Pc3#iN zkO7c`5^LM0ik(NpSBXT<@W8hYlQ-07_wUc&z~cbdnx^LhIPqSRi)sQrbx-)eyfm$`H%(mg^q-cOyz^Ws~k>5a0-Cz1;fR>A7g)N zd6l=rt4s+ zgP#XV7w)NfZl$tgguzP!NS zr6IA|_%Qy#Sq_$+8$P;SHzCm8l9o?oc7-MT1K;paawI>w44cOIbVLo;&m^l9@~B?V z`-!6MAQ&dJ6ocKXlNOOb+_xB7x5`rG;%R)B3)jUh3vv-GqkVtYS+{TOJhCClY}aGI zbto|kD@~FXDz}#K=pw%16N`h3Y1q_Q7+2U--VoVaEXKjwXE^_3({zV31L$^a_>Q$*3Z@hkCsU9+- z{7Hv@^~X&#i&l(MY$+pjlJfbl)y-$$(1q21#ny}NP4_0n z?{-LP52$Pn8D=k!Vd&*JBN^4gm1_|1=1(hJ9E=0n_@y@cVBINh-8!TYtolLtsO|-i5I$T_T=LX z7{%lW|5LMoQAh6`e>xGKf)IZ&pXqf_&wlcDO?fP`ZvJ(OQ1kns_NH*o^Iz!H7dIEv zJ*vJ5g$HLxBrM%=`{5+iE*#t}ofIE<$Q(uQ*6h{58Ewf7bl8vV=1A-)nS^J|ytH-w zdd0rJL1q423cXvtt0ZnwVwCtkEfTgD1sXr5z*o+yE+HKqFo~*k@ zF;?gvRQ?_%cVVbSrP0`bwpQCj?bo1B!<+ zy9X`C_{H<@N2YO$4g11L@oS4AwQGCX>CJR=GmaJ6KU)>JIA0jmpPsb`kaXVEX6@qr zkumxXJAYb*2&qZAKG3BLNMjH0 z2|mewJNX&1_C@|xi;DG2ZbXfE{uk`|Z`$cI-z}x%<%Y0g@N9ohf7Doco0^=4dZE?4 zWvJnaMdJQ`hkd4UcSy0-VJu^e&RT!W+p4^0Y`DvGC)4Y#7moj*aNq@ADkZAa!iq7W zSIn|^xBQ5oP+!G>pchkIyh;4Bp7#?HF_9{k5}oMM1F`mCA_hJn?(d=_keq}gaL!d; zS$0t$IPV#_R9hvlwY{{m+p{-)I?na@%E*H;(q*mo<1i)+oWxAJh?V64w_hUZw1N^2 ztQ>YLkBhjML}AS8>4@<6a>NB8!LCXSl*;7vcXrCs(XBZ3-IzEB6qj>4E}o<`X{5W= zgptht>bL9ATRq|PATitapOq`t%JnDrat*2!56nLGBrPqk|Lltv4i4neq{nVA-?Ag&$QxhhT$YAWpacmh&;?cgk^pm^5j84n~wN! zEo?%@)45*?=%w1l)Oc4h_JCI%_`f}YxF9(A=;{BXGl>9h`;v^gI{q-do5M2GP6ApvfX^C zL_zb_o*wJ&Yq_lKN5#D>ay2f$H^TgnTIUmucgt z?etTJzsDRsO*nc#Te(c`#WcH5i`_0?9raAJtwlyg!jINE>3WZDd3p9ZC3m@-l2R=V z(VL4UzMbR>`vyMxS0T=NwCC)9Qq$eOGsh(P2Y~u^e?t2{c%NH&g6KgxvoiX(*z*}9 zu%CujIf|Xht|x}Gw|?piNweby-p5~#b%njStj*<^>dQk?)hi@q`?(%={4r(6#zDRP zgN+b~IG9KdtA1M-JjO3M`C;H!oC7w2wS8^L@l0~NH)M|bA#rXC$dsGE#0>y7;~BeI zGk#ColI(mfX{ys+d!=goR&*^*jnpJ(u_w_6!91%J0uATah_#WE;=PrO*H-w1u8*4s4FfOZ;>}muO(4S)b_njbliG~QAf^{pmks! zpHJm4&xoD8c-8s~km7_(>sv3@m5g6}>&`By_tG$G4F37ItopG3-RNqo4;~#`XdFZ3 z@;|F0?0+Jq<82#1VLz6+D_c*kGI*pD6o6S2_Dg>Vw^eF%l+wP{Q6$bgvi7_gN+DW? zxpLm599*W@fJFMYH6+@XiSzYIJ;Fz4(A43&((;wDYR&QOToy~dZEO1=93T;ZUw(Un zdhq=(iWiZbXnJ0$s?MkDO1nqJx07-6noBhp+|7v{vl-44up@B4FSl&(NmFLSeHLWw zM|2m2?X<$WMci)Z9zv zFb>&cR|s+-;Ob!vgI&Rx_$+ zXIMIq$^Fup*t6=9oXHImlU`mwPji=_n-!L}F~Qu7cCpZKTQ2+q?<1kYe;~t;ZWaFx zhPEPg%ja&7m~*y6PyNJ}>b)=5*==AQTl8NzovvD<)U%XR@*ZJM?~nPn>sUuw`Q=>` zTDdY+%iyxlQ0F5xH5uY664}is7uEf=w0;nQA==ugOh=kts5b!)wZ~xc^Yx@;qIOt{ zy1hv&Cg|gH`l)0mq06(hw$+wl;m!lPpL@I>kFp&c&9yN*BHzZ-JwKj;EMO{+1S($)};rp-9O^CW02m)u!}&n%K|Jp8g}Z z<5)ZONf*HkB*gzDIaZmve2zdr5yiiv#bKVAlsj;$| z2rz7`ulvx)X^DZ2Y%0|wAlS*9vrd3F;M|nYE9Dc5O)%PnmwCG&t2dZXtobn#^Qzue zdFjyJ44bSJ6uakz5M_O~O_P^w0ABZP?t)>9*f=-@{-A289@lRo5_SNPc>$CSSaKyG zgLK%PWs;aHkUBHn7TI#G`t{L?8WqE9u5hLmC^}z6jJyf34^PEd>hHK|OEAQqxqrO& zmz3B!CX|+C^t{FWtZG1c>r9bYmt`%4GvyME_d$iWk*c8>=rFqhE!E!qV%ayf~^ITyI0wSSVA1nP=RTyl1A)QT2$k?$Y%cDIhoas`$lUpOc?XnF-ZkkWtuFEp!$N+S|N);^z)P zymZaZjjBxDJ+b!n$a~S!_gm6Z{}e0_^8sqkzy`qui=HC79mGfO9FPC6*#1YLZ$;__9F}g9 zq$6pcP5x)mKL3~5gbqkMqMm&@d|w&7sq>$+`o+IPjwG0f?Pt~QuK6zIu1pv4Ih^zl^Kf{~zWb=%FoK$I*NwRMWm~n3;@Wl)Ysq5TYxs0-{a{x~rf4Mj; z1p1ss%?^f3uYUXd`mqw$$Hqpo?L=FB;?ZpN^V-a6$y|k{zZpCf>z9A|?!2sdi=~@h za6Oxp=@sc4C4NK>;1KcHBtc|65AIkMPomf~ye@hT2AyhC+$8--?zY@3nN(pPckVv; zm{z+W@?$)0dKK_y>D<-6~-t3Pw6_uYC4ZLdtmoLiZbjZP{_~u;N=ERn=rLdxS%LZL9ujl!r&pC){F?@Wjqj;*JjL|N?y>4fwfroHWbGXCczSSo7B=&%H)J^W zQBKx^`=D+Dpu!z$EZ!lP@q5?V&}PCPTWX|D`_pz)_LGzfcph|D-*(B1{y24OTO8E2 zLH9JfQQGq1g)i(`ZS#ljJ1dtq> zSkm^W{9jYm!V*Fhb=3;kIq$=~ecN0R zq@1r(H4@cfFx$4SAT6R`Iur73&j*j^`BD<%A-fk*mcMNr{i%ERtDGfq%zDJyBUocM zIK_TA2%?v?sZLWhoL^?9+p7-;i?x?o$fVFEWNOchK?x0}<##9je4-EMaDZIV9W>hL z)xFYQzS7(N1mhhT-EK)Fq+Hb5A`G+o()6QKEQp#b5g#n}q}y1jw;=O;yH{tX9BI6E zk*9L$?bNw?3Q+`lKhwX6aS+wLxM|}q2m+JJ=Jz#*MJ<++=we)liPJ*TSxRzazS2Dx z zg6&$yN@o3fQ&7q8jADTLFU3+O!vzpZ^4DomJMK-%fOA1HQibK1uCO>96@_9Vu!3C- z?(@FN;LjrzrNt;4;KTY48q!=O5k!@j@`%tG0wOk|cg{&4uJ<#Rm`zu-_PVB96~m~M z5MtpyVR=H!O}5XzmB2gJkQ%d%gqVEZg-IK%nm=mP-%e9BjNrw=pm4g4kKJezJr2bm zK#XE4HjVtzPJV0yHZp#mzApdl(&|%t%^WS%S-?O=V8%BhAzJrkZ5%eT5}mSQ=*HeM zy_;)9xu3h^L=h5;NBEHlG{iDW?GoNLbr=Of+KCUK%lJZixqEm30GQ2SDjF*x|a z(T}zsMs;!?6kRxku?+SKQ*jDY`<%Ad$}1RNK?B50mjdlXRc4@mR;+it))EmM_KMA+ zJDd07)2+EI0!RA%e(2|B&9IO0ua7!vqggVNhnXFCd{_aHgzK}t2YSz6tPmU(x)$;6 zw_c1lH~Ck<fO z0V~`F7=}&w-oUsN7&;Bp0k3_mQ${r?FNYFDY?orHF7naShBs5XuN~uhk&}dZ7g8?r zOuiwpHm9(jw*oMaLUD?XXlx2rz)H4>@WUC-wEwc-zca?}f#GD2{5J@*uWmLz*nH^x zx8oat<5oc-ASUOa+4{%~Qi{tq9KPbznIpYVqz4@YQtiHaGgCo&J*NvZd#rgm(4c6A>>K#pu{$7TpvM!{g2`^s4BPpTGcJ zEV*L#6z!oi;!`NETkDrNsva94mh}5ujbl2 zlKFi=LCFdXuS{W0e0nc8?&_lp{D5o_x{LUDwOBGz$QS=i&dyQQz+E-oeDYI?>qjm^oN2uGYd(Y%Sw*ogPtP#WcZq0XFY zt!Ks9APlsaKJ1&y@Xqp8g1kgEP)Kc680wkhW>p`~EU(pH4ODP5Jv_Vket2&j*Iks9 zjOZNh>av5IX@fV4)e%X|4v&MChXB;JgV+>oIKOQl?M0pYqLR~)*lkv&iF+CnNO-9i z(((}e@|`m!GLD>`@bPy!(<=3(Ki#5OiqGS>w;FZxCYN?>rvx%vIoqYd2++Xh#A)W1 z22_E+42m~qarq+>#!zy;mIIraL8A(=DrBA+B;XzMKk%*LMO&$rZAu8?;F8+ZMon9J zesLIc?A`-mwr+GBtATB{JSAbuUQ|!z%YneYR$tKB)=8?<#D$fXQ)1j9v5IjKUWAzL z%I+>W2d6}tqb)){wI|d&Xq*@XK3DqTL+waj7W6II2=p_lgahrkdx^&QDP#HQSHeB5 zrl6ntl1NP{p?NWRlwq@qf3~8n0hpkl4EnBen#9b}s84&1prsZqD|F9R1X%-+z}MIS zG!_~h_B!He=(D1r!Q%k;tV3d#*ds`er+l3IC}xE+sF6z*1&ng{_(wVW6PUx+bTS{x zLjuU`Th7?boq{EvSzlw(gOHs^wBTsh2;SxoeP?1vEgRSbgf>32(^MtX(^)InJ%9t= z4&Hd0+F-hKa{pz$FwVWCQV_IA5CXvzb<&gmf;MQs_qQo`zH8dCyKuNiLN(`7fOK2P zvHx8D_cTXKXn#lfSFJ(;1Qf{)9mZeB$Fdsq z^4?u?xpPD!#hSm|Iy}@6Dbd5682e5is&)Sw=KVu0NdL%@*5kV=a#ePY8Hp(_DE=z2 zCfgwQP{5L8cW=UL&ZyL3=DF=K#Wp1lko~GZjlXqm+;A2bn%an0WFhR|OiSqFh=n?U z#eRxZ-mo*qJ(0RZ?msz0zI4wziT}rh-SU7eOQdLc!fUY#*4|j&6RSbV+4Fi2Ngb#> zx`y^Q;U_pDR&D{-qzfZ3$OwKemqtF_S}& zFsyo-z*2@)0-8mph)Lcu9|SzVNamI>KAy!5LGm9|$Svz@GCQAW;Nd!3sMeQd7j9?eEn~p_w81{2W z+#)Z>GP3pX2S0jM(D~WC*KKs#$9_I0WFc6?Wgd5Rf4J9SW|tg1w~qJ+V_hZO>E^7z z+;Hk{lCPNl;j{H#jRntVX(n#k;@9v$xC^s0?j6J@SiWT({A2PGQ)|{Rv3d9e)2Q(! zz^3RwC=XCb%=`t!gD8Rjy8D0jXHZt8 zqLKVb6(#`l#q${zSRv$kDG&6M6fBl*d%;-(EM%;`GB`^9k_l076KvZt!`Me>7j9pz zb2ysXOY{);G{V?LPgszz!>e6@t|DUn2!~O6wld&`B@^!}M&)l`uEP*U)Y=%EB-zzH zuTtelcAxo_ivaqV)iS8em;Uhced_WD{kn_4qagkah$u`3u*U#>M@-Hp81v{hOBv^b zm45N_h8J|7iQEO==s>Q`ZIfi@T*gP|TbwVndTk|o6#8y(e%G?ZdO6+Vq@Q6kRUtFZOP`6{Gp-8<6hi(Te z=Tje(Fghoj)u`b<4@gbp(nCt3wJvOmi1g4!HtQHJP!171a-Q7F^YTi0a3%LE!FkjU z@oITl`(7pIorm;eIksmjimOKJbJWr00l zpB(yZ{|UAeCD9`GrH~y4I&PtnJq@%H*3L6`e&WRTzyl6<(8}F^ zR@=Du)K?h-FXHQ({>OBZ6VVal+o#XQDI z&jCHy+96Ak1qQ0vyh5b|%zS1n>w8%#o=tw^r*!kREdx-b>(U)2es%!hSfRdi*HW>ocv$P)lLZ*02LsrKe=^_( zRlJvgL9m@_f53&P)en?bjdsbYDVnupp=cnueeUbwc#{XutC2fZp0C52I*>llM@cgft*A?XP( zTLAofI=!&w3qX=+Iebn45;of?znv(lNtddp_8z^Ep3HB~UGbOkUn4`rN%5G7kBm2p z*~|)vP5_=ehMH;Z9P6`%u0|1t7j4Pipr2U>I@$;aCMhBMTh)p2De$E0Bo)RC18K0P z9@F6Ojs32xAHpc9y#)nuC&*iaqosx@`zLA7=-|znrV(5bU^dk2fq`(s-_U1janMU( zQ@~}X3gbO(otK1xB4(k0M!;PN~Y_dwbgitN`_V75nf0L{&(+U>^kk=*viZns%Tv5ToKd z?@u6NSsOQAjT@g{0Kl2VPKm-#O1DZ*N!_Kslcy7bwzgVbO9j&2fs+A?2b84S_>h~N z)biGq1uz}54IaTA8Tg(sl#$lot)fC1rR?iCLsKEh{=IMal#Th>_9v;gSO`Z4b(K{( z2GAJ{9}7`Cc(R;EYb-k8QllaY+pID*$vjt=WA$4wZ{>o7$N^ zL|@)htVnmMf9L^66n-1d#;hKp$#ExJ+h+o`EUn?5Y$=ip>Itp`{HXy$6DsJB%q%65 z{Rc@UuJG0SjksHKwi?@aXaSAabsoNO(z7=0QP~c~I_rFXK&9KY5@@wIfYhBpzdQ(d zwZyY#idT*qGy+|E5LnoX0bu@B917C42SkEqLUM5H*+1xupct*haf)dPFnHQIGt>rru@uoICZ%K+4s(G-z% z^v?psFBa%`tF%)D2pEO-R1ud#vk}uVtC0-wgCi@fI(?>3G4Jvi@T~_G{B}Zfz*@NP zsTI&I<=wb@gzyGp=??VN@%9Y*=-@tZhp7(zZhkwcph2*8N|1ZD^5^|)5x=9uqyTyq z@4w;*e`qh~lLt!FEF8Ugk>NUT0TXq;HWOM^+nFYJD_=`mZXX7)cbwE)+#4QK@X@qj z_6fi{$6;K2tdIlQ#)2#9d=e$k=Ou*-0t8=-rTYa5Ua`Fy#oTA2#e=?$i}Zn zcwm+SxE+zA!l-Xa)e_I8E%SqE8uEIlsCaPRXVn*nADJm-z zJG8Dx61_^e$tsO~KNLf>^t=U1cWG2b`<|)hDm14_A)OdD53tBo<%K)xFOk)Zx^CAA}mW(g*hUgC?QxQ1`kYbStHW;dTF6xK|nWvj4h&2&`#746aTd7YXq1b@Bcd~oG+*t>X!3nJU3Jb^JU z0BU|aw6}CByK9z~`tGUk!QQm@2lHrCs*{#ohYJMM!Y*A^tPD}0n66O+W~9RFPp&?_ zy7R-VMUinq3K#IMFQK|jIV~h>bDL-!lF&@WNPDr}`w%3J2iJd5{Kw>o3Y+Wy!QaJKzWXSy>4qJa-dz}1oT6Bo7S%Q}vP16wa$}8{t>yca zen^5S%FVRLIQ#}zyfJ@jZvntFEjpkrx0PtV`kZl(krKVUmpG9PHBc$Fdutz3%4fv4 z_|7p}HF~p+Y{&sab=Q(hg;e zV%!mNLjTqGqLa_PSs`Su@8#D1$xmmAGJK_62H)!)>tfzja;l>C;4*+FC2lv(K zyT;D#Tc#UV;+?eV-~EQ-viJ^_dX;YFjF?agLS8hubY96^1zOC9+yM_1fncBDCba+4 z@2e7k!0hz_=!x%da+8)d!Wc#;mE0w8OR}OMP}M^Pegwc;2CNoRQ$bDVsq?m_6-RvHpyQG!6x@=9>IB6fTr`u|X`@ysjR$JpWS)X#F0Mxwv`<`-`h|XlPn96P)Dz+&~iz~j)22YBXWhHz!1OBM1@`dOo?rT~ z!%}-+u9g&N^Rt58mxt4tRQlYAZ7SWc6tosQILAiYqG+$=c?}G^vi>A!Y4@i&!<>!s zM<^~fQFU0+hB#56-)=Z(f_F)S$KwQ!s!8Qwo|~@jsYJHbyLZ1fh8@?gbk^(_MMBZ0 zG?xSlbE@Hkl-fe9qG+kTtJJ%wUDS$QyQtCHDzwyStPuQ;zQ5n^ z=kw?9pZt-WSI#-ld7k^Z@9Vy<>+lt`LOIx%!t{+NZg6;UhwBtGdJW;$+LXxRiH0#{ z08xxAEIE#XymGYU1YQnn0TUFg;guZSxR>}n@em99&P!@6cIxoW)KsVC1xvxXlu1St zCH8b?ARixzlz5OYO@C8cn>KP`WiCXPCA&>V#qBf@)=DKf^!fPq;yrGZgSEg{gsWq} zXE>1Sz(;FD(7(`WZEASC7hfFl)qHL8J^(i8|H0F^j;wIbmC2;3F6J^6hpA2{x=z4FYsWhJRGbyOJ63 z+u+tkr?PaiiuVoaIJa=wA7;UyN(oKTrHCqdAgXgy6U5RhGenGxanb$@7<5&EsO%hIKd>8v32XG5#GR(IG&BiQOTlooAv} zq@sG3Gufb_8RiS;k(+|Va1SO?wU`hR+;6uHMXEcQVAkr#%mQEI_5rUGf#7bXS`I5l zUM7#IHSx4& z=J4VWMqZ+v+eGxb$hT1?JiewJ%J~WKe*+>F|tLTvp(0P z!>w336&1>8Ob0A4*1IYtMyr>1S0wO3R$r{Smv&VFH<0YwId-Yg)i$JWRD)wjNPw4k z*uC5`<9U+O5Cfpe=9sa|+52J&@eXUa1ZcLMJ)F}xNp zy<{{}uUwp7y@rp~c(u+e9_BssiA_ZTKrzd2#sig1(c5B62dL;C&Nimg}Itw}z}W+snwQ$MQHm?VH_&|jp`Aw<+IW76o&PV4z z-DEC(LARH|^+VSHx#h{ba7T&euXTAFhQit*bgsG{$$*e+t3-9JXU~mUoT+!~=!JxkAwWh{^w50}!??0K81VwwAe(+q;Q=`arzef-Td2e8m!%s;fPJProf~&NX)?bO>hSi@!)LSnp@-ez@DRRt8~w+ zM29mk`*Uf@3qT*cs<{ERy{{qfTMkT~lN7?upA2(}A4OcSQuK$-8~Ph>EUkt%tjD^M zX#WSZP>drd$yB2*`h4&TfPA)d4WqirHmJS!>Zn-{=uO!H4_o{?8(-X2rol17r`PN} zZEM$lo(x><5%&8$>9!-5*-8m9ksB+R$`bVq+u<$4C+-OhBXLk9P^-@ z$Y?i;=irVw6aXQte}1?o!{WvIXuz>J2av7>8{0!6yJF>)B}5q&D_2<$y0|3M!3d(Y zUv^!X1wlpk(}#+26h=fFiA78$O~`j~?Mt@GU8A&S zk+_7YP<0X=hokvTpXG`%s=u!3O@IeQ*XoDHXSGSw9D(Kx-1P5h*A zeko5M+*Y3qX(-YO~={7(OQ6yIs_(0ZUY*_QkBP{KQ(l-VsGD1Gn?n`W} z_ct~sh^qw40oivyJf_j=G9)bNI3iy?H|EQJ2cgAvo8&$`QuV z4bOrs=*2Hgysx9M%UwE<93lzPjW6GwVBRIMsLgZl+Qg-0c$E^lV8343|HF4A;S!nN z=?3kH40+?b-+Or8Mc0x(uaTB&DV9N@SFbU=nR%%x{w7rXsv$WOqV4dq<-dI@TAF{4 z((J}1<6y^;d);i1B5#rCcGo^yxZgJVwDjwnhqUeT|bQo|3bfY zeIbaf@KCBYFK`}v-v8;W;Q9aW{S@)vtH2k3e{yKM$oU{}mL&B=Dm{dyWzMoY9zXwZ zmEbY<>cv36JTvVNO<$U{kVtp@-{1;djUo2DqT223!q2B}b!H2e%Y?iM?0k-&*HPu| zn)%vtmKK6lp747)BZ{9-%uOkMxwPxQd>^y3qtRSRQ(T0sniKeok9LkDfXkI&9&kOq zg?nK*VCuY@NuViC`x;mi*kx@g-b(a?odHSA@~;>89zuF!Mo95g!8cjGWz5IPB%EHm zmxx8Fm_rT@BeC6Rt=Kb=z9b@e>8`}1n5!`-;@zZ~MwO|d+C#B)YBT8LJ089oexon>)BB&as6lWUt8!>;fI$kk(fYywaHniUSw9K*lwr z&qk4LrM`k)5W=wyYP{kk)c?)-Lknjmf1KfX?ev|7o{*OQfD&NBXG_f!nX^2gz09}= z`~AN3E3#fj4BAwasDIpHF!SK|vRx!V3t5iY`a zyD4c=(U05iD9&#}(~(1*f)AhpNrKacb`-s^E6jwSP*ym`qvEjzWfi69VEF)LFa`4ro^*W zM)fr^fp|aSa9{~g=0_N8B&UX^3R`Qr>!4-{Ie*p1FbUe4=l8mL8NSuJI`DMnvDt_Q z&cB|N6Ay7j7xn^q!obTVB6D$itb36>BCEI)iNX*Jx}gNmTXM^ojp(`Szdo=-plPkc z3mMkz=}=XDK4p!36^Th>E!Un| zhu1-4Ak27B9K}#`7wHaJ&uWm&qryYI<}akKSeT#2)6c`DCvTqz{wAv~h_CL$Qo|3T zPs?Ilbf63JpVb#;DWi{*1}&+nY?+(n#K2-kO?+ZW`5qsL0bIWJO^>6K2MB*NvHlzquBZf^tXvz4Nw#t_&$P6pTl85R-fj;lpXVuyT)mD1W2K%7iVc^ zivrGnQIF;+xcl_FR`5u<_l^G%GXoLyM&{c6I8vVZdgP_Qe*k^fth9%uvrOgnvzR)PF%W2&=uo)@H_Ki zY3|Y=|7GU(V!_mMxp;#)^l7@jN`8Y&t(!o)T-Wuz?n%r#kP3fcA5eX*ARn*e;oIz? zfZu2CA3yiQ97y}`X_UC##cY4>CL{J1DbwY@7w-s*>{2ZiO!T?w;qG;iC}AJ-$Y3Mg z`Zx23qN&1_Xh*Q+6BL9YGS)y?-;Lw#Iq6o=-%xnG(dlM`Xjjy)7azz%bSkYZ@beLN zu|cy_{Y{n$%%_>wd*_Qgnl0(S#(`CuP+ZU3^~gXt#d?DIPElG%V0e}@1Q2l54p5Bq z`mrOT*Lv2Td+Y-1mV4{T$5tAGdzR!4C)iPC+W@EA^)7AjYbDF~%96zAsES3{(4^ ztP1+LY;N}5#@mTfaCj?U6RXZuNN;ZVME1M3h}bax0Lb3VunHnLcpev~5v5T+H8Y#n zLn4ADW&H<_4devCd)l*OAMmcTvzXw?%q5YpK5OGqm{qeuQpR32=)}>+sg?UA)Mxfx zM+@eIY2Y+zEbY$4?66?nVZa8J4E2v=yM?c_r*6yZ9$v2J!1S?S3dT58RYH? zaK`>LlaDT4wL9eXQYiC5(P6QS_R7awI~p-jhDz2d z{{%88)69`M{0 zWLiOMWE|ylMEAMQ*q%DGWKOb#y=6~Bj?+yRTKIDPx&SvllN3`lRKA9XRmNDs?OJ+B zetfT+r0K5}A_c|3!?Eeud{pLk;7ovZ%$BKPqP``}62kxcCL4d6y}tO@H%x_X1k!MZ z<@y2sF&*NlTy_gPvn7W=5rh1GROZFKDEVSnr}U}?Fr6^xa;K}Fa6kHM_R1>mNz{tEQ)k3>`oPn&c5g}3?5^IBURH=L+uy* zRP;8sD1*Z1udd{xEi3`#lJxynE}oc1Dg7mb@>(}EC}Ey|Ps{Q!7a$NmOaDl9R;R~M zph^ou);#8hbG(#zke82^JBtT)0nh4X7oqC}XGe;KbBW z)k8)T53x=fUW)GGLuZRfn zt5SNm?(4eH@+C^exy<`%oJ{UdEF2|BD;!X_bvppSWO|c97E#R z0E^K#06iiOJizr!D5o0olB1hxrQn{gR-4ZA5^7fH_|852@aM|iJ*hvz3kT6wf+iBveOJ76U$Qt%uAx3JX^;I=gRnu@KCMXrDtT{GuY|x z{P5l+aI>d*w7KP;GM%8IUWDh3e%~@4Ul@Uk%L&MN_bJ}zeINQ~TbLE@QKbZAc3_Pw?FA6VNCE0)%*!Rfq=>mksbe*EPd7a_4Pj6_ZxhVHcUYP21oW4K z?OiV=BTju&08A?p)UEw7mCN&PC-<8|B{}lQ5^nz2P;Ycx1BTl^Sao!<1G$#|G+9+> zm}A63C}Z-cCjJc`h{S1}(H*@5=x|mwhPaBPLHDeNMZ@-H#_j$Lnx8_0QjhxfOVqCL zZGii<+4Z1g!x4#(?{mK#_OhuE@0%%M_>z)#hvml#s|=Ku?x=H4bGvjLdVs!?b)sl; zR`LhP#EcRKu;_Va)xQ2@#*Sk4XoFyO+fT{EbO5uJn~D#x-5t`cz*momItzsqr}?nD zj!PXstSs`VxaYyqI$33z6vP+_Fb~v>@x$r5%b0J-Lmdms&Hic-9-Zcg1Ivm7PMryW z@z$)hw1}HUo#YsFjI#RqYH{+4z0d5rG*?-)%L{dH~G zuCsT>eGK4}*F0r)Qfcgo<|b#wiA!vCAW^ih#baK0fTHh&YZRKNmdpYevnMPCB+>QG z6Cj)vf|e0lQ)K!E)R(jn0D=Ar$&T=HL80dF(%|GAOsM> zv}v+}*%(Ttp4^EE=9d-cj?e~KW-cY+4@_q=y4B1WZfYjH-0FSj+h;b;R|?oMqQ3JT ziL=3D(x}>cMafyLqF^^T#!}jvCFn6agI&)F?`r7qG93fT?tN%oyM}}1Lv#*FUuq7Nk&UgwGU9)uzIxCOhI?cp8KV@ zcsO`HVrOOFod(oPmrf*f(!T*bZOXzW!VIg=9szxYC(Mz1DBo4Zb?$MEJ}ZjT4 za^>94`jD3JN8OPFQNT*1f&UoJ45_Nj1kR2I5_bgsjbz-z2@VQ?^5HvVGJk2fve2BM zb52>fd>S%wM1G^MpXi;Uxt-X6V?=Do=jfj!qXr)t2kp#xsc%FBq8RlIX9lf4KFv%g zy)B6_T}9qD-UrzMe794p@d4Bz36lvss&3q^uA?x#v5Ycai@8AYh{HNH4*ad8RVCcP zjL)su4Sjj!k#X=hQZ@_OZw+pIi%tXTez(uBn5u9`1B1)ZU7l}m!@q2N`Xf1vx^|0S zV61$CgU!CLQ)Hr)a2kh=D075BXx{pM@H79iwYL6+1+Zr(Gqm=Xr0EcI@vpSp-o|2^-cMaeAdTkYROS6>;h)vl4-g`8+ zqev(Cf=z3umbZZ>ZQLhuyDeRSRhU5laalp@_suiSA5sN$z<`q^5)%28xyi5yHB{7; z&rmCX4or>2>oPc~kRt7L4ZhAvzwf(L^FK%6*UMq%i30Nrt(q3-``S`Z&(-TVh zREj#02=B|4k*K13;upjb0+RrYRvoVIsHIXOhkrY!!jlM0A|w~?nbF+6S!KIZbp^n_ z?96^ksm)gKAKAd(+xMw>5yEV0dQ`9MuH4jus6k%diNBzU8hs@dW_zU+6a6j^gCO2! zhBHF+moDP6s!xBB_J)3bD=EpD{&cW8w7I`|pfc=NH}>?=J9xgsQdgmI&ddc#q7;lygdrNm=T3omd z0?=s&)4A9O-~Ds!OSc%(XgERTOZoDOPU~HJUb&ChScF82yw9|phaBo77n%-)i@bb?7FroHEcxs+EGLtH z%X%?J4qV*4TE$v#i3#$2;p)5@5dXOW!J#K-HaVFVD?;M3f^h@UFb$C$cF%Z^t`#4D zNu8DkV)OI{f;FVLHuxf$Ly#87LT+%F@+p-)J2@&iAjoTs5>g3q#yD)p`E76CaN5s@ zp5nCMfyhB&isY&$$$~U$%-amgJ$JBA4RRbkUcfbq)V^;{Fw@A&xP@L@2y9bYxcUZ* zpoI&EVcSW!@G+-3s=#rzF!g;8x5Va z2Ugw7mKoxozw^kEsn&yktJk7(-cfcu5i}D@P0JO+DT{3DBduE#GA0?8AP1~!%99sY z`-mP4np4mGTVQ;UDNL^QWAFiD@0L0Kc)=pJopTbk!P~Cf)6A0ZY-^|imvWVz=WY$@ z%fK>uo_G#=hn@V%TFS7^-8o%R$(w_Fyvw~)wl|h%n}Lu-#AiDMlMYRDo{d8%uUHH4 z-rRi-l!;U*!L`ybWfDnjUGaBINYGcANf)E}!GrOS-&~8299hgBeDmSG96ZP=0+i>y ziK5)5EuSdJ1VfwB>Jj$}+j*wBARF8Q_|Kw!nm2(uz2JRv*wFPUS;c!#AU$_PJ_X)D z)}QV}*$KH*%b3wRrZ4Ye2Zqn9bZ$vT2dk@@Vb0;Myo;#BF7>)LD*PzF&(EU3I;)e$mQJ)~e76zlS=RP`vya?kDK zBu;zck;%%*-g~#kPha*ze^;zL^woVsqhS$w-N=Lf|IV2xNRe$b-Fq3iGjnI%E&ExE z|M06jaz!6V-?I0PbNp+iyK#W2mwE29(~pz$r|}W%QCtB}PLc^9-hFuR{axkkPypvt z>nwfJe)+IU5651^s^6WUW8KZ^Dc50a{JUW#g7Ay^Q953+9wWIS2}1S%;3O~!^Yh^` zntPC+0gX=ecN$e>)=HdXuapejp`JuLn4Q)1Td zfz}7Atd}H1PG3mVK4Iz6&ikOSdO?-(FRCqp2S(sQ1&$gjRNiFr^W*!{%6mblA6wsq zs8B;cDs4aQ_vy*@u(BcA$;~|c2HZv;=f>UVxsPfAgKaBSLP#B7pheuvOg>ZNb=9M2 z+CX+wn8d#EwW*LKB6wiy=NIc3qQLeTwWZjCv1~5!(o>Cq6H{C*7V}MHBRqz6;Znrge%xy;5aT zEd5VJ-Uh{aznLj>&ACpX%#=!B<@V5e8qZ-LplgKX!eI=qO?iIvnc+Y#%a*j5(0tyo zabKYMWgUixfXCDh({ne84>WaexES?$oe5eM=B2f?%J?4FRI;|yCwT8(w*pnqh+9aOf_wLMs3xxQVL%xzso==kRaXjz&tXip^UVjN`F-TJQJB3(1ZmZc}bgpQ} z`Lnw`JaXY*T)?f)IZV5TQ9;M3&oqkF7z^5CK9jCaeEa9wE28QSBYCD~5>fi*1#4jf z4&BPI?^}^e`AKSusY`(z=M;7_s&w(>BaU8l;!EE^5U~plyfK7$8sUecg+ww?1PD%H zaltNkJvPBO3m8CaRq>HzIS9VEv520h+y8+6o>~0IP`2(3tun5~HRk~YQpAxNq-M0q974Y3KOYb(uPl=4DWl;_ng@1nMGYu_x0r zVwHhGU5!H6Afc&{fng|F-082rf3JPXlarGv=yCL+1mUkf!-Oo>ugPN+Q^S|M7 z@#DK`VDf?01x^VAQSD+7DF{_+C^Q4wQ|TS@866_idiuuQ{Yhnf5aTxAKe(?r*V@Oj zI6OX7k?cx4S?krQt0VDCZ8=?aZNY%e)zdq?%ZOLh(BwUm?QA|CcRf@LI4E|RFOg?E z1UF|nz|i>veSB#Q@&izbY)f37nv7X-cd6uxr{0f~0o&!!gvhydbRuVpfC z7LiVOq1O(1E+R*r%CO3Rbn6>$M=w6MBV3wK3#0#JWQeT)-gGg;?n|#II&V~f1Hr2y zO6OxGare$~*92*{5Pe2lm*3!ya%afk^Ua#m4A_|610fE zT}5vp$Ff;OS@zpGAdJ{DDjXp=@$KtRL)@lN5!EZVstJWmYtbQ{5ALW6PPK|3&tjPg zCGKrLw)YErf+w2Fed;_h!I8Dp&_^qz z8~))5Pi*+tCbr-_I;VwWLtl7)Z1kHUFgpFN9#CDA&;ybG)AfpI;WUWJi;!UG3-9HL z=6|acv7=nzj_Mcx`NJ%vgi%_15z|6q40DN7*@7HFo)g^|ad3M0C4*k(JmIH8dB+{8 z&cE7mRd~DTvPs59rI8@mB7z}>DtOFQI~#?G4g-Zj$rQ+>bt)$$^kGH7JW-|J$Gjd6 z5XuZ*To4+3@M41@SbHB**iKf^W%E#FKOR5LcPP11J#n2QYs~B`>l@G)Sni0(vNORe zGs7KK)cYuQ<>lZ&H45Xzd!kOCZv6C|bpll4`hSB+5`){Lv%~V^{b8ov4~_W7Jw9E9 z;*jaabvLv56O3+Z@bTwnhJV~Dem*=QeISiC9-y4uGK~|1?ZSF(;MBAnmm{!lMp9i$>Y%22I$;hF znS#h5WBCW&3}{P(dmG3v*#>LNHt8VTLp@=EM4+#_oCSSWFzG#Wnv-!5XeTpfC_Dg9 ztJpG*P*4>a*)J||Ir&URJKI?F%wC|kHl7ll2(k6@;(lqTmR&r3bCc~PxkaDYXSJT&YlspSP ztdJ1|s|XciYhv7p>I$T-Vyj3*0DG!?HW3l{^u#JWggmg@5mQ11&0;Cr`HdLERMNnC z;cy4d zjFq6=#6>l^*D79~ix|l!Ym0}Ci5ynJGrBD6KihF$)(@O6(&Od0lS1ujjcE=u%3%SA z!&m;=saiV&IhWm6IZ}qA$WT}p_eTAjND5NWVLjifaad4evIbefhB+s7~AkG(QpXU4+Abr zx~9&IvMsr^R6g~C_`|ng1UQ=Pg1lR!DVzhGu09UNsq#X{O3*B7fzbyuh3Fo+M>2hF zTdqDnrD0xwD8XX#?vp+Cx1n&oQH0|=eJ^O9BtU@PdVBong)|d0`WPeM=k5rgQibZzY z5`u|^<5CrzB4113(mDA;Hx$;^d?SzJOKB;yXS$Te7UU)rTK)#<&hY&IT;t?Oh>E(C z?{#VMHS_Dz+s#Cz1Lw%IECf=W5>i%Hpd81BYCQgmKl#blii*=+`=@P+>F~(iOaI@E zP~G7FW<@l)MEv>bA`SjvenV<$fT0lX?K)dm zzr(>M%}%%!-IPJ@4SB-v`3zyp(LmD{A4V%vNm^0bI`>Ky7J8&?|JV-yCD?w7ZO(GV z2bg6zs{^9{Abrbg$97s6-?Q{Fi{RY#hiw4tM>&>NYm`C_wIwV3{t;W%Gf&P2-oV;H zDFXn#@jGbucg%gH(Kw0K)@$2%bU1A*N==cVtNll?%OrE9O@7kZSA*g2l=aa5&&YN& zLP{;^_HR}gma*x@L)J5)<@0p1Zx7icz~m3{RKA}t9okf6#&|d5PA_a|6o|COnHNZqsmJj9ORd88f}g#SIk# znz>ggFR6K95o^Fzq=kqr(5wy%{PA1wGamv-RsC5gd#NA?hoP{2 z03@6*hM0A)oh)$17?uy=fj*@1DV7QP zmv5xgj;ohGkIc~??O!IYUq+u*Pb8CE)sB>Lt{K7e5YVkdArC7I+hFWhKsHFqLJgSrosh!UTtC%2q7V*rf&Y9nd8oGPH|KPM1Etgr`WGGASc{2iRaGWbW z%$0XX&8Wdu}b?~GQm z{JqF)O8*yaBUt$pP44vvUcWr^wMz2^K@#48QT zWg(@=NP!$`iv!xAu|(kc;r3EvfA(I)l?xXU$vbY~xMW!r)>=>_cc)bhoRbJu4ZUzH zy&HXZ(}7*MloArZmt+UoiOUu*17`7^NenDkcLONvS>faM)W%%4iQ=d2vY-Dv%Z$Ad zE%-M+7UcaWgf2(#4S<&)4TABzcgSb)6X=iW2dVt%w@xT#dZQs+O(~j!{L$zlTM{P4mE63} zF0r#2n{7832ccw*E7^rH#96_uND%U88Z@V`>=HeTWctTlRiY4C&al2PSX&;j?cbCm zZ@ePD05~tt&ea2}S@^W1Eg zZPh{Mig0OAW#j-^Udzwx>eH;YuV^EC@i;Ci7Id#IdL{~n03pUVnhV3p({0CW< zi$>k{eTSUiyH(1Y@yF*f<-c=WLlfQUM9F5u>{>GhBJ0b%0J`ZJQe#oQz$~X zIu_S?_ZnF{ZY!2}v(a1;N#B#{930pu@wv(De@`9GN^w}euaD2A;4^B)Wy}lba+OQ4 z(8B3YFK;wXe8omC8`AgPXb|9DDQr@Yeh$qQA^Eqsr=?E^I;VVzbS|y^6TykBuULHi z2A#S}qlgh`GHa;-+h&&~i*a0GhklZ_oQjmo4Ee_s7ua<_&p9yo&sTJA-^2@MQ>dbt zC6adg^WZa}mZl?_RF(G$JE^!{*ft(vPx3y$K2~y)jTwzkxnF9SAWl^LtT=Guw5*6B z?4;Fn{Sh=R*MgB({rt+ZtSwyoETvSrc{&g6d(f@DEc8;^-=|B*Sn!KR#ayI>JPjfGymesGQR9A@b|(eTzutEw>>sm(%6KEfZWQe z5|Vo;mT;#>7s!~zfJG+ZQEAHwUt<*Sb~MRO73bEDP&46Ee#mtrLab@z$-O&UAHzhF zOKozwVf!t%Y!xSFxD+|Jy@B)|GwHV+q@3TL4ix1SrV_?l#=0~*33 z-%J5YRZ;WWCAXpPgOjyF{N4?Xennl*pZ}O47XtlD_omu*f*nMC+*OjL%2e}$OT2zz z2}`8WawliYYR`}L5kTXBI7MmsgY1MF-@4*fnC*^o3J{rU`dr04wAG1b$T$;?U+D<2 zQ_l1`Ywa=#l1QrEWU)>+@W>A8)_G34e0x};y-tQeSW5dH?mw~9qBId0#7r;1#7mWu zLi6rfIn}S=C*;b<#)G79SyxSaDxDsR9+TrI6;4Zyt7k}dIZm?b=F3s_B9q2ff-coC z(bq>kc^GWre%CYJcE9G6{rQ@rx}jSE#?IR0{L)?j%JuNH_`T^1lD-U688$vijZsTn1@@lu|Ks3#!^FUMIqvww{=cJc&Dto zq=heUaO0Qw1b^+&NT9wQZVimhO;2`A(7KmX`Dj;!WYVPhwT0JzCal>J(8(L`r3Zs8 zoqRg{MW%xAp}b_gtwfg!F_!PHCIh;%45RWYeMEag3Zd5n-$RTiPP#bk zB#@Kgwso<}OdEm(^eYsj!6Nz=Vfa_k-X$I|#>PN3fa3dHQ=DLO_Cmm!bq3+sxia1< zX5P?;^t4$9{B^DM3=+^5m}ox4x!qyYB}=-*9r(s5l^P1WWX6!iSGGB>X`;$VIxFyd zq`0r0cbilEomi`Vg=N^5(N@XX^tFQ!w~7x}2$1&V=krYB)ncSe^Ix*q6EBK4q+KI` z@>i}BoYvfMWVfcXnl^z^*>1oOeRka@zPNJ~C_gUUPm{SIZet?BoJ zTMYXrcL`M?rAFGu4IdFK^h1)ZMel5eBz}{(|M7Ksk?P70EcMR3d~d*hGvu4^tv?Li z4=28TlQoD-c28{g^K8kITh&$8TUv!}aXLk)!pNuacJe)Nl= z+|?5~craJfA}ySJ9w5Y5W*8YTDLz9dudH+0b{-HO&iM=*g@eO*Ol0#PpZ+wdF$pkEY4uk95-K9nk%c(tpCtN?XktvHRHKlmpmvDbWs%|S zz>1_8$2`%Zw2ug2`Q3N#i~wh3zV;o>FuzE7fgVGMF~2A)#W3D5gqir63sI~|173iM z)V`S)4^&fs;5yy3u%VahNk{i0u!CiLG^(a)(1PEHch~AE_B&*>66s)IK!F1;!k<|H z7{YcAFDyQADkBU9AHizBHf&qOre5RbfS%`j=KfW27QrDAJk_^W`@sNavKIl;)HE|u z`VJkI$^)>IG~00tO>Ml03+HR^eyX{t;N3GC_<+yri2QG#sm(X|`6lNbj&?cP1E+d3*s*Uw3EXxAk2B zCji64?@6#w6!dE9K&-=SH(#)UIv>w&Rueb{4UA0>FjmmK6quLq*lwnjii}z@t4n zZa-|sfr$3jrRvT4A>({|x0(%F$E)E5R6plQhC?SDzlTyp8lh?+wFkChPIV7?LGYQv z_V)JxCL4I&Ser%!H?7pAB=Wj2CfBhHcl(1(1UO#z5jdw^~T#QbTL5uj0gdVpLE-JL;u5P?sjYhIcU` zu4W&tWZaFwt_%jTm0VmOCBy@eG8;SW!A}{j>?SiE-NCaSW3K^_|#{u-h3sPuhW(ODYOjNzwBm~6sebGW z)~z(%OtKtnD&;uvC+IMQX@wn5DU&T?+MZ9C?cdRYkg=1wCHT#IlUv7Emo&VCYeM_v zrp=;_29&m&n$I`WWX`jmN=DB$ADmZGTk4U<_;g=jYt80zrU1tX^hgsJIlkkll| zy;88Hfa9mV!ZGuBjt;vjJqvnSPfbr&*Q#nqAD+Iz14o1Cqs+)_JCSK@1$BXp! zqHxlb*y>JwgTpBB4>uxL0y6Ax>#au9t+T9OJ;#dQ3GYh1?ZQ(!#n%KS6ID4L6kcyv zqu;yRY%1_KYHtc-B4Z*N)vt*AxsGq=Em_cSvP0mSV}AN<&mbkz`A0s@XXJk4tKdN`&)h7D z7aSk4$)VCD!s6kyEzTD7_3$u&YY@4Ikpe}z4n*@g{+ASYs z-5EfoWt=tKbJ)Wl{ZLqGg0;Otdqfi#l9Q{=9rY~vcwo!ElfRR9K_SnAEI&~p+I_Iw39=(X&4+=% ziiteWkJ2J1SS?bL2C?@o;*y-7QgX4o{&St9a3SgzJ{YH(G z+)_Y86Jf(gIq_1vY?{+bDbziYTH*YU>Z4Mq4XE|M{hW`X*1bb3a-~M=1%EfzJuxg~ zo9VRO?T|rrX?TkKYB(!9FxN-^9BC42eoi%tYJANz?fWoLO;$hQ7?EqeETwp_9k79n zrXmZvaVum)YRgbN_lpJLK5l2gmxW))xL+;K;xb;(>MC*kbpOHt|FG-Jj zAHnn`SVJ(SV2Dej&A|6Y|CzQamZ#b0I>v6!UB#K6rg=`mO`t$nVT#xhxEEv9m4LkrzkW7i{gacM@ z_D^d(ueL_a%)^df>1KOoqvg`NIUTkA%>m*K>vEIOls^+jIeHHVZ zu3hcZC8tiT+@A>iY$%-80LTteaYiCj$JoJ?vNsf;9&<4l135>Us6$I|sI}ir4njK2 z;btrn$F#mwC+MNtZ`bcAQM{6M@)ZDIuMSMEV5={n004J*FKqCEaIpnF?S5e8U0V;r z$@Viu3#0QVYkwXBIx`MF6sz#u?uRdA7qSGS-V#aapvC#ewALsrSBG{(3kJqX*=_sVQ8$oNpGoGa9km^` zQ&hjaT?kH4?jpT}TJ)XA{SEi5BX5aJZFK#qS3a-~KaM|+4SL#0xg6{n$~R}36*8D{ z@Q7OFpLENQjOF?DV%_RXCjXDU_Y8-td)tTg8Ey2M(UXvf9*kb1C6OQ`dWjlQ6FnF` zL>HnPf*^_BMIF(*5C+jAT9mfqnr*AwErc(^jVoAPyi+ty7%_nw{mNU360JS`Y8}Z7lY(mw)(UDW#!L`?5|h9 zzWn+F-N=>U`9g{xf-9hskt3DIcx|5;094LEBKlco#{iENnw#FxrUczCMyB4F+Tqw9 zT`Swt4^^-drXsj8y-0OjeHvU6MD49ChJRKW1e(i39d}wSk{?@FW)0U|^%k5$?yvP` zK(|@eLj0VM;8#&V&FuPV2i#`tVdA~|Zjs^k?aoDp7_YritLdzShJ(`6ng|DK)lQGg z48?-=oM2ch{dT-nJH#7aB@fglp{J;N{tOM`3apAr&ulCCqrGy)D{5GVwnXjs2t()< z^8-@i#9#jS5OX9$@>73k@WGxAN2|SsrI|=hc-XaQ$Iv^Wzwi03!#F;06Qt>x++5aKTsx^{0w(WFE;;e|1=oFt*%RFMGtL3`dK zuuC?Beewn|WJR!COk>de_tnD3I}3Sv34lYf%IoXj|07kRAET|Y@U zO>ov;X4LSoDxM&BD|zefiY`b#lJ^H_^FrM(pH*FEpE{1{N7fIF9P<`!7>a-1zNF^H zn%$kHna=YszrM5$wUi6+jq;a+1}f74JZeK$VQt`8VU1?dbok7s=u>$*{3?`+#w@`s z%`Dff*sQ{=L7`tt)sxa>0UX+tTfoB`%LsUM1jz)-cnv>>=R>9Tdndi80G&&~^~7}& z@c`@*)sYws0t+{>h={DwLBOik1P}I9lv%?XKZlx|YS-vgU$b`N5h@|!7@8g;K4A^u zVa9c2ZsLGuu_R*dU;L!`QQzhJxWObQI^??6?pk^26a*LQud))tu}V(6kxs!L zE-XuW(|W8?wA?}Y=_3)uBMdLdo=NtI)J|$HFBMvI@PY}y^#f6HC*hn2lt?Y$KJBz& zDOag`$Kh4{U7Nt6)Tpj6RzWhB)l&z0c(Wko#KAV{8*7Wtapwy|EiGVjXBsFAmk&K; z?}{C%$)|nNvJJFU`}O{K!jU(5nOl?quY)RoaaWo~%jQ08wLL$5O$7URDB5Vz?3tG7 z3LhARuUkWrjD!1R){$V4vI55WH0x9{u+|HQkCDAoI8&3p{zv(iOs5&_?46|PPE!JH z>#^XPjDyKG9c!Z-ismPHeL%e2P+T$CDgNw6wB}hBo)aR)LJHn^`T}$2^w}EWbcX4^ z%}uLj>AuJV?K<1aFFOLfDA%b-8=Q)TbyCeJr zW$~zW$CIS@zrTBX!L0;8S6;hF;S|`)Oon}6okoR zqlM(7mmguCZ1s^(9dclSm5@_p9_M>L&~mU>-H_t*iR(~;nfqB;35KfINxU?4bHe&k zrj1*q=~jgwrha|1rV6G#>D{8EHH6$-Yg&92;}{-x_rj?qKpssy3UHAV^!tY1TZC*~ zjrMfUY=RMIo94djwV7D}RQ{ zikbntrvQwnZIOtjL}hN8-Qc|S8SBgr{{&kOB)jAH;AhH}-d^@qinNd?H;O{?4!&G2 z%7dro^oPRCe}@P5rv7FmY7Zrg@^xMXY%072~Sqr^S zK?K3-!E*_Z*wA%Q5{ZSd8@8YymfCz!=bX|ksPm#NLUu!_hz?TkT>e~n-o1>w@IyOs zSEnxx541G1v_N~BWujer^kv^x8Ya3^<)+m#WjGJ8>{6Eh*{00>^Hr2F-``%w7GkNG3_{*r)cY-%lmEfSv`W5$b-lz$dGApyV_N!Gy7-?Bu zM18M3Yg}Np{xi}9yBMkl_p$h8dI-+0OjWRL_x{@5Bz~AFk1=yva#^K+b!>_sks7NO zO!@rhpAQH=GdX~FFwjyg88$#Vbn&3IM*feYYR9>eSR6ooO9}}mF5fvcgK@U;?I=0G zfZy~7((c6IT7d|&l;t;7y9Vv7?|P2Y0uvAEepyAj25c#VF2k9a667dFltE@z;3l@X zdv43&zN0yA&AlAFPGa|9*+e-whk4n`&uN&*c<#L6)@7xgmabKT81;4O)Fr#?8lZCj z@}OPfUC=J&F3m0jPtiNCAy7?0M2uCVdHcHS_(NFmm7&38hiPD0K+9SsJo|c6gOudF z7AN9>@Kerv_?dTITUHu*19{~vU_SGrr1%mLDE~dxjOD z#JRvTnnccs5capMIgSlUL~#MFRtAkE1gA|o!!JkQApYT1qo7s|#pw?M7y9V)FAd_jEjDjm*P?6S;4@k@ADN z&E+CGe&VDw?pr{xB=g)BqUf}tKn65}$&sssP8$Aw9T`{a)J-F()o{>xNN^GWr$o6` ztr0{QnoJz9@;rMVP|9~Q%VXT@M#sRrW$Z+G$_#)n=PvA`bRlpYZ$uByR|Co-FCh5LWOV3A97nZu{Ftim_y%cjc zBtb!Y&DGU)Hu%+Y6J<)p=y?N4gxN|l%ht*zeH**GU=x0#@q%$|+9sM-GXr z5L-O*XN=yacLb=GwgS48E>iiyA5kf2=sU)Eb%dgF8Jy`z|lV5{%wxb;BElrvy`hRi4&T zE*sA8{Ri^ycWG2N<)4?a`cGm=t+hKYIs?C6d=2cn=nCwL@Cawt{X!JWuLa_lP}7E9vIEs3@)B4a-3uL)#P0?@{x-j%dG}sn34|W< zi4@C%Wy4;?5$)?UP!r&GnTxD4jrg6~i~Rly6w^V;5I1#VwKlN()vZ^&_Zt4aCH7>sSU{sJ zC>b%j=@pjY{7Z9W-E=n~)TKN?oe*nKjGoaHkj_0{nSlSj^qp!!ZA?^n*qg|-Wg>1hy9ZFQ1~G(p;nr0!#5t*|7f zjYee93i}08{^3ME{a`O>+EEPSpo}o)J|jk-j|sBifM`m#-~0tcuG67qiMPTsh9v@- zhcE2|SV_yC7d)c$cCAbceL(K5Qb`>)Jlp;*eM1XIONXG8=kk51EfxewQk7&V>4a}| z5hB}?hCEsWuA#21g?Zu>QJmdhKw&%P+Fv0^gm94|)iK5jOXkmwst7A%bZ)g00iC&2 zZpFALac(E9o%!>i^rslPg{5eIpAHuwaZKAw)^c4{z@n}k<;|V%E-Hg!K*Fts3A$Da zJCXa{UrdM5T(}Y4(y+m>oRpz|I?P^Mxi>oLg~ZWr0|{?2eDD1zZ=kn1OHD{j*gANW z0U;=FdGN%JhnBj};LWz3B0{6($@4WfVB2faM0a<1<^5!@-pn92lK5JCL(*zy9G&xo zHR1({>us+41hf!~nzj8VZ?>zRA=ZfP2)XFZkf=^1Pq%!p-;`N6K$tsemIQIFHu@NE zu{`jWW)Zstcz84-5LTefV)ugtwlZk#b_?H(M$|)~Q(+El@Vp*NYx_@u&K6CP%O&~%}GMA+`3UP^A$YcESABf@Bm zYw8eG5f+$zB5o$gMt+AcXcKiM)_Cug*l-owXTEPP%T^CP#KOy|I`U$F&_Jbu9q2kP zH<$y9l?q52#gfoNy4=6n{FarZYr2K<>puq#U9{U(He@B-P@X0tx)4dHq#k6NtCFVx zIgtAQIRDHUz9a%e6I-E*VP9UzYJ;LG*buE*DUW?DXfrpiLf4~gU%KuxO#)PBPy8Z#Oevm`F_GE zsO;12CHMNrJ%>mKvb^m!NO%FP?a!%JkEpV5@I+W(X}Y~tSO40o$`W@nZv>Lwr$6v2TEh764m%!V;mQDu_sVmWdR1Jt0MF1dr~uC(@%t8@6sPuCiAZ9foiH z-T5af{+8VW&CBXjrT#;YY} zhPx|kP|si8u=qP6j}tj&1?Xqu5(?ek=Jze>Wu-r2>PD#o4cy-g_$AYidAojnR0mNY zuVaJ%cZcVjgV41w`6JS^2bJiopnK7T$>FS@fnM9NA#z*SM)eV&1J_245eFJe*ihJ!M(4e&#PjI672R616Y2 zMpG>^Nx zOoa+12{|-pgb6vd!532Nc4o#0yt41K?!{V;WUg!HBxUGRO8HdX>XZs_{7H;lyP2w2 z@k2)^YufMX6y|Ne_@#?f%|l1Wyom}Y7DFtb4tr#;%Zz#<(DLhk3klNT6IaObSF81O z&;nm~&*h9PhqRzLR&XM#BiZ&L@ozPH%@r>)MM${E72|Ru6h!{ShvrzTn&W&>-Y7DN z?Ce!#aVP-3WmpxH_zRx4#@v~+cJ@n}WJNPQ7u2gLiXub8T}QN$ZLHhVYhHV2RxGCao_Wo~hC>qe zu-tYs(;B1cHvn{!^3FjXu9Bi0^%OrjUbfad--2S+vx@fPn77*o@GEA*2WnWvNqLA*sI%>dv$HOtP^<*U&dk}j}kwthO|0gpcQ$XyQ_ zJ`dEj1Bc@6;Yyl(C0EG7nRKMyf@^dXh)R21|9rpu0gU3$F0PA~XMm1V{0po(LJ5%= zh!kv@AcO?GLB40xp_7Y~4s~(cHB}?2=H@LXYc2*>Roq(f8MIEIvs}q5)Qu0V4M_)X z&t%Rsl6TFdx{?PA&^6x#9Hs@3Ha8}()UC8kkD*g02=Wu7wv^;*FW|7f&E3MC2_#}| zY(0M^&UDn#)`+;?W*N6WrK5%$mTKyr*;hf21yhk%0m6#1)k%e<1bv^p@;2LV*roJc zS9)lWup;NSEGCGt?aDT*(}M&?YhE6}vl#g9V$RnUI?jTzMCeHQz-9`lQ&~QthH|s= z7rnGR^*>5e+cybn8v%Msu*V{hD?1dwPD&`F5-3N;2Tgp$Mg|6*y^l12FS`ral@S*;TM%$WUAr7+MgFPLwjRA%hDs*y(1k;H*gqd ztGHTiWtina#~m*k*QzvM0x)}Monfw}k#~_e4rc|-3O?7*o}}}^tfb9bAYJ~&%H8ZY zmeynWaz*-jzqoOA2f2U#m{6JD1n2HO@0Nfx1fz%nPDNfHdU` z0OVZY^Vu?gQ%E`%a_{QXX>An$dgRij zPc(>^9JYLZc>gNL9tDOa$Y4H#7J@${=gHhdECnL{!?1t@Uf^#r?73O{XU&|+)omu< zzsvK&lUv#3r;VC=))CwRn=c|;;ZHS?FJJ$7pC1;eF4ikQy!TJ^uuSA7>jHVqX2*zfC zzN8t|(EQO-UW!vz&NGqVx-Cjp=gE~P2=>xggoj#pg+>hba#?8;lTp~h?_gqSLQ!5r z5>!$WTqyJ_CnicVK`4PaT_5jLB{2CU1F(nUWO!I{Sk6;U?x6kv%SoY7^0U15plxv# z%ODb>@N5Iv61z1O8IkMn;BDqL8|nT{I>^RrRkAMSy~p5@h$_yFSHXFA6!aQW@R)A- z8E|AKjjvYDL9;|z26c6gA2!^iQa$YAtGsEr_>zR$=a28yFAYr6ck{7Kok3Jpa5ym! znGOI2naZ)ABqA>?sE^$%!@_cMO7Y=-#{^sb;l6Iw@yaQ+K~oBLW%KN;VA`k4nrlKU z(`2S!jC%XIAe}5n)~S|$8R+#e zmS6BU+_K!|){c{$pY)iuP zqS3Q|PGAj?%YX+5IHenmMGl>vwF=H_fyHhCPTu-tIk4M52FrpLTX1NZ_HF+g@kDKg zD|!cfdIm%x#)oBDjLx{V>#9-0mxi<3>F@mZ)ISb8XN$p_Z=3$3#3j`q~^h&zBhBeP}=y+*;Os7xx!dE_(eFu{>z?%TOhXd=$F2c)A0W5YzJA z!jhTRHNBy=4o`9i1fr2=71QT|eg0LW=L-F2k1j)t{M^M%{EF}uO)1P!O=A4bn**^D zBPae&D~85(E-6T-{9Y3u@3^L z|ML*|5W#vyhxGrmAOG8>|1)6!J4*lBk|%6C(`~`h(mUcEyU4GZ_4ESx=263>p}EKW zpY`&__q8q!JAhWlB<#3t$o_3S%Veox}4Eeq|~lC9)FuzqjO zd(O2D-``V3b27C5!K+EAZL<1k)zGA=Hh^0WR~mEW4Bu(!>NaGFF9We-BC&|^^maT* zpB;0lzOvacFCL#JYitnz-#*M{JMak9E%L5^+EiR>t5Y{tav|vjF4OIYVXi--VN#@h z+4O_UXWK~+pjPqUFFf+AeYRuKAF?dh?@D;o{r0k;t)%teG#48zA6%MU_=KSHxC3;w z$AzDb1rT&X{@Y}+lNzVKytIpMMZGmwX6CX>xun9WYE#`48Gp};JaC~=n)9V|Ygw(2 zBlr{IU@1Zl(s_8QZ-ZN251L8GA8^n}KEk=RY&CxfT)~r+M zXPV1@U;2Z~oH4*a#1{vZ2W{4)ZyzHM0_MCs3GG&;*UeAg{b6xyYrxI3z~`&s0CIh- zE*($VesWY^y(llIG6Z|nj?#d+HcF83UX9JjM~Zc2}o@V_gh z#N%e?ix=VBy+{3{EmP^ZH6Qp`BUeSM-OG!9Lp-W_-UohCqhZKb)Ren%;Mt$nHsf1V zI5&^m@QAB+5Be}j-exqi*g{qC6etw(M9v)mKKsP>6zbCL!`hG{u+5yDc|cZ5<(_Ryb>G0)XHH>2q9e|HPqh`gBF`GV$& zGk6Ba7fFvUw9h-cz4BOTi%W22^$X{Wd6EN4A?$+TL zpR2~3AIp>^<)phd46twARs*&=ov;0g0;a3*bczpY8@i?XJu3Xm!B(~f!%E-4^=@tW zvm>GU`Kqdb#XAYRi~ImY3g`vtTx1VcxNT#)JVad1`KmX5-cKD!G(BGj*;o}<>kNBD(y!`^H!L3}G z42g|)?@(y{Iq+F>_X>h7ieVcacExSHouiYmS0w;;d`S7bLGeWU;^$VI-levXG<-C$ z%iC6x_Jmuwl6tUy&M%<{H_~_#^TC5%s6c*gh7?5VuZX&Wgg?5)Vw3yNE9s!6c{X1B zPKj*WpVLzfUk?VEoly$cYF(5hkGjMPs_g|%1TH$=T36`tC2{uuK8^l_z15_>qPU8% zn^}m2en;_V1Ua(NQZ@S6Tvxsc^&Eh~FmIjM2YrEzV{nHG8A-mvnf0DkJ>OZ#&NJF2 zv(v@p4gFCk@b9Q|zd+B4=vS9o&|L-st!k0a3-4Dimie66;;Vrb1wOoRv%8j_KIKv) zDTobtJ#U|9a5~f^1e8_k#bDHx+zh~`@fiS--{53Hy9!jMqw;SnsdiKP$=0%SB za-=p}69Wb~LV%;&>c#8s5)vS6ulu|7Omf+r@m>_e35t`$8S?eH`^b?xK^Lu}r zTkef3ed(fo?_HtxD=r1vNSyBZh0ng>6Q04qS)rIt!WkmRCr}MNrV*yN5Q^WUZf$g(q|ClcV+*l)7pUPdetVebqro5$n$ z+jXjF-0*LD>h*rzf%Wy$Bf-II=^i7vQ-HwyGSgL)nQ#y@+T2tANM2;4C^11(d{$#- zNCE;OKc27(sx1oU?oqs!>pht7o=Aon|LHa2D>LM=n0Sy&Nrh>@;5breHf`B=9iT?a z29CA`?U>k{p-4krgQ0vo^tV};xuvTieZr`KEDrWLgx6+W+-ge5=_#{OW!n}!QtFs4 zA~8v7AnC;DGFs;NYibj`&;aWM;(P7I@4}{rfLE&=W7o#{-bf<$}0N9mCs429oY zxp%gdTkJKx*cb$q-=7pa2c4I!vPyQ!KU?;%eA1ZsIXEc)*c={Y_s}hKorPy$Mn5 zg-I@Rv#+k^O8#;A_GtiTeSgR*f|x9J0PElN#zA^&9DUnTn$adVB1m2G#;ND+*r25w zgZx;KuR$j-BAlZx7T-3a18J~i3Fk&NgQyW}eC z6rueW(QTh}sqZQ7q^d9Hi`qjIG= zyr0*eV-Pmld7eRGbP(SsmCl!1UtqAUu)DC>{U|ZoZ-H;$t6`OTVDDpy=Mue8yH2-& z&@JOX?V;cr)U$e%3377CSElen_v16FUmBDY#2n&EM$5rsYzHe7pm&TD=a`pwB;%*i zPxaIRuS&gQQFJZY6?`iDWkGjD0Gb%BP3ds%`TRKv2?^%Ktpryv=*K!0Nv~CGZRX2_ zwUE(gSL|H9YD!rDG4u}ZufUZ!_lj<$iX1s;uxt;ZQE#yj5wd^p`0TJMu1fjaYCu`Y zSq>5+;~h|}zI7F9=Ka+r&JV%zBP=S)FYQ{7u$3u^=GrNVpq*C38#^mG#gOcZt<)DS zll_-|@0rf#HCclzfE?6fIl^zvnEl50&Eg8y8*O^Wtq|uSKv*pAi2uR?tYV#NZ0{2A zm{(+4H*Q586m#zp5VEmh1yo?>3_r!&%#?#wWC{Rfq!Ia#J)6PB#cU5Fb0WyCw{O9x@^h3!eiEsBq^-^eCW zFLZCa;GmQir$^aWb`tXSNIbhDq;>AWsKh^09^qZmv!5TPeho0$+St*X2QzBoHdAxY zYH~6Rx8-NNT)nnFFc7_YW|a+6q%Fl z6?hs~b5@U7)_*)Xdl_S_NJqnu74bihs&By`C0T|0we=(Lh~O+0;uZPL!~nR^Vm$oa zw#{=#9LcV;&+7Ei|566Wyh-?Y`U4)B`sXloyu z4(~gOEHG2V39x<}P3Z>>@BK zR<@>yD*+)qO0Et6=%)S;jKs6lSlb6+gBm*&w7t}fxuefu5vc*)8WFF+Jl;5JAc4O8dYk7l~nc4A%n# zU|!ES8lCF-@7QDbItcmdAOZm1zwVzB0dM!x{x+`rgU*!2@oTmwmu4I0kQxBg)c_v_ zld&iY5P{4@GV82e!e3%L0RkwmQ`O7sqCSb*h_}9|3`_RoQLEE6B<1P3E&<9?u>7k% zqeuEGR3QV%ff_LXqmu?)z=QE~;`nS;{?A?JZTv8Q1@()TZ#ZPJ~4YbLcR`NOYZ*OU>KW)=*4+Xpjl9ZR|@9(@(iNC>bm(guhaq_85 z*l^}w&WN>hm0RTka2{}zF82eR?W*7NWV#C->5C# z{>AT~#QQ%jl|i=$F0a()`~8Vq&1>B1&E;j!mLDJyH*pj|zkqyN)bp3|o0NAi{{h~1 zE3;ub{HO8s#+GZAmyk}>8>BpPO=r!Ay#ATE$lr!Pb!T@ku7(EQG#ASmVs=io&P0k! zKbU%O)w=Z|P2}7X`!6X-y-7pS;)IY6c`xZAV8d--EZm0vc zHhqI7o8!A2VR$@My;B=NDftrhlpx+|93dLX!F!PO+AnY1>NUb$fa*Amjook zq28odKnjJwjC@aj_cof0;66#S3<@23p@6b=g46q_S+S5d545aei;+M z#~x(N8vK`;jPO~sO;jNp$bk`)vc-i&h_Q(Bnr2z*2T>lsv$CYB_o!BUYY)BUxVsn@ zdNI@@+=wW_djkp=B@P1@RaR*rxpFVB!8&Mv1o6P39ma`O<+YQ~vGqJ5a)c6)cAXw> z))QgcHtjy1T(sz}PafI|_K)*;zp&!SOf@^gv z(WGa0s$3NW8!Q;yT`jdS&6wd{W^S$4d3jNi(3?c4z-TWZXgqV1U?G6BZYNr~NzzP$ ztCx1x0=KKMSzNh0!LK?uz9t-z`qvsYMElLRj+#+5-A1j)c%|&0AyXx30#2Q=2}2ce znSL_lzi;rb@Sa;%N}fKw0xl$XrF#HAP+|i_Z`Re6{6ihMNfF%aPYJ=+z=`V*krom( z@#Ntd4qPO-lsA}0LDZkMvb8hjIS$miWdVI?X-MdYn;L(@zmKqD$iC`fGyQ4U!dEo|QhAmT!=F zGMYr2KN~PXHtpbF%ME*|9`@{W(G@Sc(3#<6KeE;2E(Khy2@>w;UlAFP~5SmiR3in9DPyHcpU!opOb^J3ts3P?An|K9_bW*OtLc z*F{I<&5a2gieWTa0P#UZ<1G2GByzU_YA;Ng)~`-f?MeXD^n71t=4zCfdy!w0$EA)` z2qwyyIheIWyxb?8IJ{m<2!$T5XaZYP<8o~+${FM99@cALIu|Cs3(fmrJ@aote|mXq z!;3;yYFV>=7)&Z;m=?aRY$%6?T`tSmBm3rGap=8m;qBYhwHl82KA%B`)@$E(cf*n1TtOnt-5 zr_MR!AJ?jyl1F-$JH?*&jR-g5Ss<($0~12inY05dKr401=L3u$0HY&&%In*j4Ksm? z`Az5#Q*BKEE24cykD|BI1ecH_$om|tKm#fS)WEoyWXcGaEhV9>b)E+f%Tj`}4TZ#r zW)(4QSTT>#l*3EGjq8td%4td3L0JtwN_5U))>(qle&Iv|s%#!WxsFyky>p~3r&f-e z<}kp)=u{7{<}uRcA{m3?E2gNdiVb0~=JeMFu#Pmp=_>=1)XUr!Iypu&=dUygUiHr= zTmzol(@@1g#VY33Qw!m8>hQ3vWG~MGU!e?8mt^&W>&AZWYX23U0JWuF*ZQu+jJ?#B zRUjk}xzBmkI~gch&4n>{tzK4^yxz2^v<0@&P!7J|!AabVu;vh|BKdkq9tkk&N%Rh> zBMo4%u@*_+Dr?2gbV8_HJw3w^otC%&N7qoOdd#V3Y?xfAxf?*GvgK5+1~dwoA+usL z0iqp!Dr>#!?u4Na1{ep`Y2!{*U#n3mqpPNv|6B}XRj?`Eoydr_z%82w+AmRw!nv*t zK=A~7G@%dh*?e#LuArJm72lvIrUWcb?LG*S5I-S8N={Ec1B%5o2+Wn@2MrkkQHD^+ znL=s0=Cj7W%$4T=BY2vdcZj)Lel&|^C|4m_`?8E0Q2vln1fq?S+xXFioDS2skGXiu*F_YF%$oRW`lxtvIKh zRyOGBM@6=Ahg|82b$F+izh>e3f&iWh57f;aXnjo(6Pla_h5^qu^B^QihkL2pe(B(s z#Buy}sQk5WCr7!!Y_;qK-(+J+VjGRU-LVxA>(+Zdt5vYOlB7r55Dm~y#FmMZ5dKM! zZ$^PwnyxBxP(!SCK_A80o9m(@s$N$TMR@rTau=z>?So6NwxWeZQr+-J)1fF3SYiXvUr&(=mJnV!JZV zBFA@Nw1ldSMnh991?~i@?`&d%UnB(lE(&gP4>r|YYj^0zTQK_Rj3zKcGK(BD*J=wR z!ipS6u9h=9(vp{PMi7fyvn9qZxy62#QE4{!bJUY76=9zQo$yrZ;@)GV^}&IBL5Ph{>*@^M#S z?xVziJ%``5^de2d{YeVB&$s+$l@`$2Yx2VZ5U6?0Xd4s8LC{rBe~?*M_zRLw)!v&d+RDQxcwjV{I77NR{LGs$>5Tt|2jD{4m^nP$7M3Le67f-aL!~~&;Au6nK*|B6QvvF6c0RJy{r{QU-nSOQLM8hmx7T_0OaQiygvD zYZw&lSfgA~o{U3mbQ2oZ$ZE$#mgd)A=@Ne!$on#wHZ&~cek^CRYvJ@yx}>ne^Vye8 zy)XS<{W4G^dE45tQP-L5o$M8cGn-<)_9XXHj#c` zCEC4hI?6Yc$EkT+efY6I??ssR9hi|C=T|EsJ`K3!!Hz8J-Wor`GC4^%Zo!1OjxnD; ziZTnvC^I$;_YkK56*13#bhlEQ+u^4tG(m)yyb1?}?*x3}hV@dvf^{VGB>4K-j zNpu3d(qK_ZlRXhN+%%)?x90=dnOmolN!ZRw5ud2@1TBbN2k!;1>Rwh7FLl+ulwSFt z%5b^*&DVt&%wMH0CeC|1q`tWeJRwRI|HGf;x0Q|H<6Dt>FmZdhKip@Rk56;a%{z^u zU*AP=VsA%kR-G(R{=4YUw&RO8zc%;t^~Zi>$b@3{>NRB_nH zzsmowjY>Y!~R%3 z_~SSy08MXBt2*DAaI<-`5xty#^Tz49Y0@b#Z?KWFvKS}w9dDd3QKCmu&-EtGtfNCj)aq%k2gC5)e$~xHNm~fls z-I2VOuu`JEU%vWsakJTp-y%oi!ZlMr?&kF60FRs`B#or5s5d_sf%;UK3oFT4`cdSS zeVDIHuKMcD5P#36bcZS1wA?8N^Rl9G_oZKNz$#??di&LHO+L5blE>3v=Y*~It1W!@rHVefeaS=M4M8k#c70BA56Y0+zH+Q9bK)*Gf9>VBO{p-ck z>|Fy`u#6WM`;2->Qvb;Uqxk4p`+C4C6*0{#Wub4K&F1ZF6J05PhBv$w7juF*fS~BW zj1~l@i2$jqsk$_8Rl3mKcw5q^x!mDP*hM|(^fcJ^=z-c_xdG)Gie&tirPGNE!GbEmxuDJNEdxxV}!|4EPl@8LpG3Xp}kNc=Wr zIn$D@32i;wb{9tb!5w$lI<~qbGvU))(noPVQ28K(-gx;BSZf6TV50u-6g#sQf2?@^ z>GXA=hCr=U2kGEy{RH}=(?8ooSz2pHWB%G_7XkK1%^UfBK63ow-s}$jmYN!d7OWWw zoxf#dYZ+?IeH~oRy8FCWoB8I%spMTC<7xSlbv`j-Kb3En1S&x{uhNIeYOF2@ibp?g z-W+?`G+X(=wg@i~To<&!oE1Urk~Xa->s5%MOFA7T6vyez)>U}1bxZPbTtCpxie&is z4O`hKf6XZ1hPV*^Ms=XG<<28K2sB>9TOh*`(}3@&1-KmUH`~Mx($h5RC8T-WMiQL) z4HZ)!te6z;OCoIO7sePKI&AQ8_M$EP0V<2qx5-nVLOqSC)Nx-p|btH}akbe@1b z8f_(UV>RZUR?u$dftGkO4gf*olSf;7MEh@3r|8Iv@QpNl{yfmA9swZm`{9bZJ}R(z zGfoc%W1MlZvZXUtNpoLn@4mj=aq$wbiH=OuxWEE>iy|r_VyDqJC1N(Q*O!&URMce; z<_1?^00w5n4=rg_yH^7Huy}t?EVq8P?lK^GC_5!%c$}5iJ$1ZZ6sC(^=xObkewsV?Sq6)yX zO|ebSn_+Hs#2089zUF(siQc#N3xxIQx%_m}EXr8!?@K~dB;3y1Wpf-84`A(#{VY?y z&zGdZq|)W5$m-}Eg3+2YU}p`Bedr>GxABxK!r&wOjtiyeJjITbrpwC8TlTzGR zdZUl8Lk<71CXLw{{8>c{K(!XvdE@+Cgm$p^QwRM{8~EJ0qNrl$2Q}*@xfETRgMI~> zr=58sU!L09p2dyd)joOOcl;sX31sQVz#Tsk12RcdozM zTEAiM0NIONGnP3}JJc9~5-ZQ$c(loxwA)>4#(KkG5?#eX-TTeeu#I`*%Y8odQ5+w& z{d8whqW821ys!%W3aay}&f~(n-GXigk`_7eX)9RXck+o1_N=qmY3p5z*V~Y#jRPkA{J+vO2x(J~=o`rq03F3az8IpB9?j})+ z`_k)2p}V)s499?Z=(Sj+m2F3+U6LtJY5y{aLrCGBT? z$>ak4>7#T?3eLQpaKf8KARoD3tNTEXp10zAObBmjXcsUmN%2NcwkCR>#xl*ef76Q+J<%;}eB{EjBeBycu3P8A<}sQJck+1bPY1Qyjt+(8 zagg;u`TT#GCutlDK%af?7u+fP8It5z1;K{w>51ZMr#t)%UJKXGplIsarJUc$;q-Uz zZoZ70EgG9zf9JDEPBEaoj30>huJo6*Xg~Kh&wqfEt&itWh9xdiJ>Ri*+4G`h7tUZ_ zeTON9SJY7U)C+?_2aSonb_`AJM`kw{J(fFs^r#}v-G$OAK|1MeEA~mMM8CE+%sgmN z1X=Aqa?&tY@f3H{Kultt!evk7$=en+Q z&JW$>Kh`mALE$qhOJ5*!RAk(m;Ak|7$epNys7^AiXF8wp)@?!E(@w1Wy-<~S!_0-^ z8sS&M)oF(h`0&!^c9v_e3#%^IUtbDV>6Kgpu@T9LUP`)JhL=vjLghCbO)25zGHW1e z?5saMKk4>+k9C^u`Z4W3Z37gnfSXon);~TYrr|#9ww;+w(ql0*f-)7OR1~>Z{|L0o z0G`iIuWEvP>sJrTicIw( zKSyee_zio;)avaUeLo{moXpC1DkD&5{r<`@-Yj(!j_s*ze0jvo>X8ZygRpx^%#oq& ztzm*};Xfho-3oWu$}dP#*U=5C8|jpPBmtkbKK8>QwoA`x%Rd|Fz&9=kIyQ=nY79f~ zMBuC8^PaK{_rS-->L2%G`xs^?TyA{>-a{?fF`f41AnnpOtE_|&}3hdsa= zF!os61{v`jcNY3B4mJ8M{`AyI&nYSj2-}m}0$uKaUk_7&W(YvjyTg4{?1$t@(=rgT zbp^caXj*@xJao%f13*CRJL{K_;o=hI=-gKpB^2xw1>Gj3EwIu%il4i0rxm23CNF{0 zIc6zX?disk^I-0jxFaQBDV&V9TpEy&ucMB9`_K18YfuC>r4tP4Ahib6<{i#jB zvPqOU>X3L`^IcLyLc2|A_}@|MHRh`eH0$QY2U#v|u*HOHb=a0rgpsv>C>Lj=-?_vw zth>PBqT)~f$#z*%N0rA+)Z%X|yT_0N?!$83ETGqo;STfQco8FyA&V*MQ3aL9yoxBX zJT7IXKR5B8b2GpOmh4fUEc+?Q4WZxei-Esc8D16JC~Bs*m#=yFm$U`#C+c6+LmTT# zQ7rU*Q4L*0VF}P2gN*i!Y!x&8ba-`#;Kf+Pwx`>oeCdZ{HY73!Zh!i% zq{gK5M#l}t9;ZxyCnox7&a);dJ!DA(+sSL#G7nwzn9=}8Pq!@P9f(ad;5k4BE;#*`FW(Dn?ac1_Q%Eu z93Krk=}6uYzE=+fUG08lK2wst*KcG9&Pw)v#@^Ut)fi4=j4>X*K>TTkJOmP2sL46S z0>gJ3G330hB#d@$x(Hx?3mj~t?Ptfg@#hUU=jTR)HUdYm_S zn2r1Tm5U532xiZ0EeF5wK&@g0!1Gv$d@nKGjBuLg$_1>#>&&ZW2HgQCYV7wYnYQr3 z`EwzO;%R(-{z-#0%4E3)#sZwBT7O7^QAc;G^M#3BaA7QW+cB01eY{Qw%s4O9Y@=>* z!VorK88Lrb5k|i>g15N<#!=KUT$0fi(JA1RW4au{@ae8)2refkP1=EZH|I^=g5XVJ zF`^=uih?H=y!fIP3SkhXxdswj9mLGMSesqb#7|5+p4a{tSmf52Hs#o)40CsN4z}fJ zNmEk5Q(_)xFD%uDY}aX)2J3XoDQv|vJ@TavDAT@F7LdJpWPaKUD|P%tyU3)O<^iu- z_~RdXaT5{N-e2ClFT>sbbminO`;XG}AOvsVJK?ao`Rr=V%Frdya2vBw%+$l0y9A*; z%MI#z*a%mm7`7$B1ai0Q+$B!co(nd8U??}#G8J= z&c!KWPf9uN2W6wro!I?*hE?rZS3T?OGG*f|8^V~s)Zt~z9HoiAAp&S#y`|q)ORW&` z2kTprzKP9fx`Rxw6P`?dua_qljSEqK$NP9a`e)9fSU65b)vXA}ze{|=ioi_t#$Yd@ zB~7o)1@0u#FTTu|2}d&x{J@s|_v8BHIeE+HHdmAs#JA!q0X>lRialw5NS$xq1;a=a zzA%oxC1?Bu0QCM3my!`wKF5k4ybWOFo^9*>98O6V9J)4?ZHIO0yzG04PBL&~wAEA! z**$+^F~`ma?e*Te#i48%t$Y<+?&eN{*Q^?OBmun{p=XPUJPrqH<#xLe3}US(z`yFD1vFv|k)4!PAC`(n@OoDeO3Ui8wH6V=fFH$6Eie3b2g- zXae)}|M%wqKXS;{K+A>{+2!;@7+k4Fm2;fHZ{CkHr;xS1(Ec4RIL@9-K7eV3Bb;q=1Y&|xx($w^ZS(7h@dzt~O~ zRM5;c7rccb?K>$bPr23^kzYFeQfg?e*(quyBF`1@0Yoih7P3Of%)*Gc^CqCg1Mm2G zpNp4kNuAKfOV|e|t@t!#=O2L~6K?8Za(SmyWA;GxOvto^c)U<}QcrKe!O*-cf;E}u z5+Jw zM8hPdm;WsZ8TnU;Dzgk`jvK|X-{~EA$m(sb&R<*9zg3WJbo{bj=WWfF@|>a0(B+t& z-7&5z?(1dct1>m^70UGz7QcO1PhzFLPnmG78R$aYY7je9UhW zWg~DE%ms))IJ~doA5#8n+{!mqd|?T*=L}M7HT=2!Lzjc569pUfx^m4U^%}ulHOdcJ z-{wxP>j3A9q#Z8pJ|q;?D#wcKtf@mUC?N3{CF-OkF&028(4#e3O3Jxg_gqAYv74rX z48tMkuk!GXaT;wwQNzyQvJkQv7TSmA#=|K@%XGBq6L45S??^pRFvHAIbC^j6Jl0T~ ze*>a-u2Pc!Y4?4CB5^vImN+6Q0pov8`+yA4ExV0Q$PYm&ika} zR-9zA^eto2zBj{N$8&xK4t3b%`tip1zI99cGy5;&yp~oy{@a-aH;Gw^L3*;@`5si3{u|<_4AaayX_7l+GKR@oeLQ@cZM64~feb2(t4-!kO69Ugb*6RK z(arVmny~M4p7cQ9A|PJvdM9Lju7%`Yw?K9jJe-hqe+pQ)Wm-!Xdf_N2omEsZHG4>K zltOZ@INM(Z!RFAYlLr-Ua~!L8tiA(9g`Q}x$U>`4ugA9qv8lGY8cm_6)Kx(cN=FI% zU{|r}r!bh$_#x+Tm76_~OzW$I{W|(DX+-Lzc=Pfuo&>>^6Id9TPDue_ZFsbKR*Pdx z-P22GX@m^Z9L1r@x@xZ&oV%7xOG1LneLB;0fLP(m5_6QH4xzqIv4C^QyC12IP>w46 z{k%O}&S0o-x+sn&0-MLxA# zAXswUM|*+en#@6pKFXvd-MkTvCK#w@-vV&Y7wU5BrWTwlN2oJ6li$~Ghgz5!`}=kO zKs7k)>ZsUXzDfPZFWp1utZ+YBVMuQk+j}rvkSN=IgG^SNolDQWCvq-(;~;0zFkLPUn6?)i;Dp3jnW3R? zmDr*5nlEA6K@!tqZ1gSdR~fjZ@w@n}Iw9DoCV&``ALZ3SkKfG#we%eFoE;5GMN(1@ z@QFQ3k;6xA2R^%>(13e53f$^>BTem3t7pJXWEfYRFR%FDdG7K$#ZGC}Wm45l9OQv< zLIXqi!lM3BDzNC5U*>^t{zQ{+uys`kwtM$^sOE86&^; zD7zIK@q?|ctl1hL=pj#mu3GNjaFoc#!|uD4jqLs4+mjOdQ7kvRyn<2y}J3yP$uk{g}C~Z@45={?7injUp?}7hTJUMkadCn#N@IFEHun-wuXg z-1tYG4auMEkb38|R@inRRHF%!o;}q7T+5NeiAeNL+_bD}wK&@r> zdHvu0RJ_*b!ymvE^Oj2)aoUBURB>Gwt?Gj^*QdBjHw4!&0fO%5Zey&S{y-8R9kj9M~Z3W~wRWo@Z_dnITKDFb#kg zbRC$edY_eR!UjGKy2HV#6a!r+&Ur1Z_?Las=)T9yq-J8@?&D3tPw`urP0hc7D?McG z50Om|4`eWHK@l`#&o?TI-!Cv*B4Ks!ZZ#c-EzW88_!6t1QMsls^kEYdU?!V(?bt(Hs(#Jm-F-gMKWos{_L)x~; z@yoSIVrb><2jFNySKx$c>P!LCyc30^>~06R4%3im_wQv_k??`7R}@+oD+WP_gz?e8 zyhCsQ4%DqxmO|)#Z>1T6MRRCi_N8L>!~RTnT9s~V!OW2Ef?x*3@>ou4khM`?(NSO3 z>U&u>`sw>8a!B?xt~Xq-fJf|o9L3guf*IsHnys_ZAEelJ#4z5F&UBDtpDHH)_X>uG z5wZ+RSTrkLT_$;E;R_=!j%@G^2%CxYH^2}wrbC!7;Xit8vWr70Np*?=FOP(=?ZAN_ z%)o!yQ`r;}EQdrg7}mN+Z4B|+zIfLLUZ(`pdW#lC^3?rQt~Nn72}EHLeI$43?toYR zAyp0*PGgPDE+)%uOf!jtLWcc;1KQ{%VdhD+4$RZnV(j{A>rsd8BPSzYqXMGN)FSYE zfj-OJFFQh3X1pqu|BV-z%(DcXkNoZeAz zR*qd!(5TPc>4?sxeXt`00G>$hZ#D3FW>6$kjda%|uZlRS{6$=0b zt6kYXYif{Obp+S{Vjyaz@mFIGD;D$TI84P+^g6;@xhn{+b=PxFQ(rjR`AH0IY1!7W zxgF(>rIJzez+x@mk2j*1TWqT{hHOJKlKc{B+;v=k&Z4qR;f1Q3qOJ4GHOGerBJVu6 z#zwxZkpP`c91{?MGFGafKOJG6qZ%eb1p$m|)Hg$~Xl9c7+IA7PaSSZ%Gk3r{L{6(G zpi$>eoU8?l&|+&{>76=SxmmFCvn3V2{>EsmFzcvphnh^bKfhkEZoNsV4Pw&mD?8Tq ze|C~Z$vPX|_mPn=O;N_AR?XUa_ZroaWESBU`Sx#L$*tkgH^{iI#<>RZp)*ez32dHoO5?s5t}X3;OpQ&- zM@~kjI)4zDXz|`Ut2k8n_S`HSlV`~UoNC@!`pHOLOU3uLty>ac zeC90Hmr?nkrIAJejs+MI_bD?amwrrj#y0{g4%-aB?!lYN!G{Am;7{ZWq>A}{?Yl?d?#KY1Vp_z>hX_~ z9m|Tdr~rO6GiG1idgsr|z`8be-9h#)8zO6(GD#ys{i1v3Pj?dB9#3nKJpj-qfjUp*b2VBdIj=D4C7 z?w;t8sekp^t2hghi-~xw%g3(8>FRcVc$=-3% zZIg~=f$NU?wn3rwb;4cWTvlepKB>6D{+#5H$LN*D|G28zDq0`e8?tyLVpC+zw9p`+ zR=O}TT(<}9!6~Y0mihEm&X#vc%0qrQ9ppkNE>5SGh<&&L>!a46>ZnB2MKKQ>hZy7{ zBYH32felIj>5vUGOx+Kqrxmpe$jz5iC*3&mne5ei@(A+52E`Z76}T?eTts`XsR*bN^a~ptCN08BE3g*^T!HC= zdg_>n`;CniA(K1X&5zD7;!O%n6UP>`Y1E=5)6AlV!g1fo=%jpZ4#w_oo|KEerfsP- z5rwgj#_0In;|PlZdvO=}gIafba?Dk18>!0nE6m?K?=U_S@=vYF>>5MIKQUt{kO*!{PIK-{csYz3Y#L&8 zU;K(ndg$@A1$uH!ER$14fED_WS>K1f-)9O(*CsOOA9W<$CI5M&D#{dY;5aYk6eL_<5EM37WLhxVlp$Dx{RhD_%(Wqk3I~;9+th+#bLcgCZ}L zO=mn!#`&l@1JOnHluY(&?Lg+A?Se!r17E7Dy6|HwV z`25O!6&6muosHWU)6TQ9mz0)Mo2^e;XoF2@UZ2#MZm$TR(~pKLmd8*en~A)9E#qZX zLaRy6o%>|M{JS`@=lgSHqUw^3;g*a855CiH$9D-}eMQr#!b~FIRhO&V=~cRjK&zI6 z6(l;;hEt^Y$Gw>hwZF5n!2oNrR&V`*@uVfU#Q*fS{S+niN6e}W1*!r~a`4Z)I*PS2 zaRER!^z)?Okdl>SCm7SFG!l87f%DEYjX1lK9jxlWFftkqslo)b0$O)&MFwq0KdV6X z@EnBKEnCA54PgSiq=?{haFgAnnv4+M)Hfb?mif6UleyT%$EGBN1fV3h30iYB$f~_h zX!Mde;GBDBK6J7?pPkOxPHHkczs`kSS}y-)y_5x7BqiIafNNdtaXP*j2D)Up8o z)>L}_9E3OFPRy%SNy_3hZ)h^!o=E19tm1f9z^RHKnuha!2E0zBixEVI#;g z)Vm%a@sG$}t1w27PY%4b5OvR1D&WH&x8y?0V(}fRho?B|=L@@EFtfdG=Xw^oEOn2q z1Cz!!QZRpm*58xg=0hxaN)l;e75~1cPQ<9Z<8#EBjNuJI0@<&Hg-DJg(dY-^1rELCt)kqS4m3fw z&DxrgHl1`R)~?Gc+z;%)^=z$ENkjO}4k3ZL?h=$&7rVkyMipyrk~hMyC7DM?ggAZ& zY|VSGI-^Yd`!?%+e2WZKU`C=VRK71p^^aGx!VucUVJ#ELw4OIuY83fH5mUyg3Wy5A z9bC7Vi}%nEqOZy=S@=V)n^EE<%!92kBbv4&|NN{n-fO!TX4_USkBkjP6u-zm7<9l@j@l-{=;w- z>+TO8(ZkG#epbAU)ZHJu_j*l*5LN2AKRSwq1v>*;zFWSWME?F8T6!ADh%5Zr5AGk$ z?6gfI39BVO#rZt72#!qp<;C$xJBdMx-QV>4#pym*1_d$|_2h-u0LRsK+yNs!fb9JZ z_Rd**b<(nOTM(dacS`DJy)~tF*atg;&yWgNw0?z8*Iz-ye1-JLFIXY#kg&VXT|fU^ zEjFGlr_86YUu^!KQ(_s6`5;pSQvLkil0wfzKl$h*Mb(o!?tj0sQ%z=U9N>TI17XT#Yhanlw982sEZj*I*$ z2h$mqN=KvR%6;kWn8fV1m9r`J_#d0_d4Hc-j@a%BtBt?7E<>**j=K=5l8RW;!@c1u znV4D{=+PbR)7zpsA!H4b;w&bw|Io$U)}idKDdg(k35N{#tIP6V9VULaR}R)bs_r_i za}@^Hu+cvCYTMbHPDu8U#FGp?@EghvI#{%kXB^nu{+!4it8~ED{-#MR%i*Kbr9OE9 zpngfb%Fw~`)zwI6MoJ1eL)wCTwx@|HFg`j~VUspe)Qie+*^`s>pqs(5je;VgJlgBYOT8Q?fDM; zg5YD}i*QJ8d&BU|(`|tjdi-gl1Ylw(^tP~)wcP*8Q*si0R*jZDGo((`guFJsn+*Tn zeOB~QB*NGDd{%@qc}?BztL&_+G&O~Eq;Ec)CNplt{?73vPB^n$4nMk=ouOS_n2AIL zjrd?a+~Q7^&Y~DKD))ZF`m+tFVlq_vHv?i>+<$6NHNyDCCvoodf=_=k78|=fm^AxP zz2-tdHTc!O8{lI>}F`pspQ4*%%OMzi4Y`~{<{xD5e@sH^Bd}N zFb>MtKF{^6rW*Szld-*SxYo`T)fe@RV3~C`*Pw8bgs~~HCh_8R*qXA2O*xf7|5$Fh z;%kjNFy>nkl7{!IH2Yt-1uaAdq5}xlB|Yn2?^@{=aHJAyl((W`#HL%R-Psp_n(sYK z;uF+{IM+kBKF{vzRKLfdjJ-lJz8SvVh^nm$aElJC{Q80>_MEId>MzlU<$D6@R9*pX zyt2a6;d72R9fvk4x!L}ka~4=*svAsz|dF}zB@ccy)={O{?J5rNJyDsqFaSE(3A>NS>w zSAZ(hz4s`-B&m_-%Mmx`+}NVylSifOFR_Ti^$*x+^`L1nziQYVnBd0a$o#qK&9ml_ z7rcR|o0E&2m{*PY{sD+Ql8t)!P=+C5)Usp7Hq*P>CbP^g2|&A8*nh=5PQ3Bc(f<0D z6cz&3wq$G1fxM?SRMkn$q236>j7dI3N3W~uazY`X$>@ljriA>R*>t>7NhV?Z*I<#P1n z5Mr)vD0yMb4?g!^j4l#90@94a!&VodChrBU8M0pv9X#J1a=94)iVhq@%)St@=t}nS z@tho$>|OmO00!Tm1qc!#(oXXgJw*;?eX^a}kOF*O4qq`IlVl4~98~G+6d0uks~TbGt5cfn2x2+-fR6F^ zA9!s`CT+E%boE1ZU^ONsZ=LROiFJoBi9R8i}?Z#)3av=!?YJ&T~h&B~^2**KlNrC(z zd3;22^|}ze*C&>egD}K>kNpqe7`nUAtat4zKCYSAo*Qa`vdxX@h`KVJ~; zfa3IGCe{-@;{j$M7hOa?s=YW-Q_TAwlx}_W)Bk(U-~sQj+bG>;A05iG&OKlAK9 zYcLz|Vhw@j@(|BCDL?EHGJmyc*a>qZgd8A}#VlAv-VF|@W!2svXfWoB=Ua>0p$_n~x-f3wgevZvzxwNP(yngyr7ntF1IR`eP2s>MMZ?3?Nmt=4 z95PBPEjQJ?S2+l|=s+LUjA~+Fi?&WF77E^A3AT6=zROD~$A)yT_~4I$uxjwVS7v)< zBggW2)FiFyo)!`3+!vV!N}uNynW?DmT(+J79v@&7nI%cOb6w#(S<_}sL|P@Xwf6>! zYE@fLt5&hz@NVZW+0A;`F&JZQk;z zRLpn;3~BQ~j9R&}&vx9%*krb3Ej6f`l1RMCe}atvX2yOI z5gIJP(H7KN$G+U~4BBQLTD(<&HjS##r=9Xlu!q{m9F0?sBW0cbhk2zupu8tLI#4o$ ztcgi_nL5PFPIdO6YGtDk{jS3L~cOC%$27JKw8yMHt`f7oS9| zEI`i;Wt#@GL-Wi58lARL?8nvZ_`|Y}(+L&0+9?ub-Cdb?s+lre#~}FCBwyjjP>Ivp zNP&a${R7LT#~u$vB6KP9OUVLQidWU_S_IjzKT&)>Wr$Tlv7QRP(+$u%DT*(3kPAZ3 zCx@GlY~NBgF#6b(my0&#y-#@9Io?@bcNc6RydC+BXCZS(8*qPBSwv$$ojKKwrgXk# zp{Lce-GIxE?Y^EYEi(;_RkTVT&bvoFc5N`Nl2W49dV;a>Br{pR|QTDt`!fTFm zshVLtPT^vJV#KZ30~~H9=+M;j5m3v%bRshvv4;||fjVOu{!VIC6y%LX`*=P~*rh8V zWS>?P_Z#XomDXk*L}V}pMm*@C2)3M&aFJXG+L}-@uu?J&Z6}B4JLICXu1Z!YuF+2` zA^}0Z@AfrFv`!0EE#lfy>%1`m8RxCJ!Dq{0oe+sde3CbGot$ed?{!YB0AX8CBB(<6 zjtF;pmVORbt*b>{TYIHmExD?kohDlv)SuT*x%l56knH8oHIvEV1Im~I!ij4HKSR5a zF%9cv#~gU_!uHATSS!OTLSn~xvKp#>-$pY2cf33`#o>(bCa4`iD>jM!Zhi4K33wDM z{qk6l24RFZ)5OYkEqlXITCbnVNdlFk#u-93JKI*V_1Ws3#B;Ad(LJijTM5!lA!P77 zdVCwSj^veAKvC8D0OP#u;)cX(e_?P0Qq&+zQtn8WJuPO;dFM+oX4nf4Dxd5cVEz0^ zfYeF4b@MJ}mAzjH{DFVzSD8a~gyayG5O2-9MjAqf6wc*N9g;5yAv;>wHkV?8#Wzu6 zbYN)u_boWU0^F!(AV(rzG0;!Dl2=*3Z_5e`xpCwLu*m{M1>KY)O^frrou2BO8tYGB zqAiwu;<@C9MB$TNoX?FmQnR)z>bAD%g6)uCL4DqL>}F|{yXQ;95;M+(#LUPK2;V^x*@?v6S(_dF3?^0_xf=^n8}_%g5D;wFZ8vP!K^ZQjNI#FT;~;Nv1$oj^ zgoP|591+i;bnLN?bg!no*fY5&#us<*Yh`pbxCWZF9$)qYjmjJ`Y4UT5` zDw@tj;hh?>*Ow3?H{A4fd;pKvx61`zS_wLrb;+V;hR;?+o8M$%A5lc4ILm&LyTZ{w zt;!4^6ug9IXm6g7cU>zh@mbB7yqlTwf^G1+Q)ELy!xenFTM$4#{=Ba?kQcoX)go1#tfaM1T);DvIV&?aqRKSw{y3Oy|RG@B-!dMJJa{qGn|$J=<$z0$Stsq z=lcOXY)#BEmCtcrvF7gB+Us`@c=Z@$c%QUXJcu1SeTvC&8taZYgvUOmqR@{nS$G${ zLtD&K&CD0gM9+I|p|od-x@YO*aGefJ+XP6UFs|te#vFVjxGj2N+FGSMiY_w><_>`J zaTwaVoWd)k7tT71wcMN>x`9mee|AH+`!G;Ul}&83f5A;ao(=yjaiM<0+%NR6^i~;)qU#Y4cQ}?owX{B>Ef-qt<6>ouHihfd z^U_*r$mVU(j}8*Lem7C`kQ-C_AgPo&xC1K2y5{sY2zPV#A)S@Y(A6iUt=Ci(Lx{lg zSTR10vB(a8coHC^qGY5^o^-EON2x`KF7lKIX7{t>ANzJx#yaaVQV<Xm6$hx}P=wZ=RV>||_^xWFC)8t||&M21aEK-4YQgjaN0Y^YF9^xKO#&sgV?CE_|J?0Bq05C7=1ZlBjiV!&S> z=)1Ue9X5f~(1iwJA_w0!g}Zte3nLbJu!q@J=7$$hXDc(8(D+q|+gf~Ou zUSzoTf2hPwO{s#g2QiL-c)d`?mtRQ1K6;ZGH?hHAo_DnNk}U z*uD*v!CIA*eD=kOR`u<%U5MQ?=9gWPjG{&5z-9CJVN+vTnqd45jNf-J#rxZ>D_5UZ z&wvc4raelw%=iuI#^hUG8Z9-(dDVk{wOaNEcOMqqm5%i_E}9naP^d68hF#2R!c4~* zlc~+K(XPmQ%Wb4ecr6Isj5q^F*7X}XB<$q~DJ&1r=zW2#VEZ`MK+#(f|L7E+X}aQb z$uYNok8U`9jr&v;djcj>!SVVz%A)n>us+?lkE3|#MGlwJq6oAs(UWirM-i2( zL!K5Kzz^$gfscvJE+3TLXi+1zUp{961x(!8cmoTYBk1NN9hiUN1LBT$agw`X!P@1* zw$}iFn~}NUqEi6CLgANdJZ}HkDL74%=S2Gc>~$^CypInYUokGuoR!}!$mjs1Nje?! zBKQKdQM2MX0A9d_eh4QX@x*}xhvih8nCX3!v~v0u2@>XRb9poMf6;jZP?5;m@9}lS zX0=&1w@eSl*Z-GiFu^ZL($lk4kJd3KL3skQp4eB@a(ZFztvmBy&eqgb0`+G+cHG5G zAHn7=g@BS6gHv1lwKJ;gO+Bk~H08xMEKBfzQc@QqS#WnN z{YQKB@3UfBk$eXew&I`@(=&+PQL4UCQmXi0O41A`hTL;ugp}yBe(q6L1c~vka{#$B z$AtD}liYJ~GG{3uZmvO=42dhha|L5v(^N~~6Af#Z{ED_leDXPti22``n{`4mSKzV5 zSBn=sgHkHhAKV!3bXg5K_1oezVvcS^<04|k*1mplUv=$`%k}5{dd_pT?&GyflY@$k zfyR9;VM#1w{iV$y*d(n-@bEn-Kffz@lDWO-%0M5%mxC<5wNBBP$V2GBTp4wo2ATXD zb^;$G1au2!*P#1VYl$}OXVQb7}8uZ zBd^=H$D87)j7JL8K+Zr$4-!P2_dCb0o*6;)mna8mG&ML_sf8U( zIx_Yda!&!W3N^)d>xQ`>>}`e>lL;Ol(Y#tFpw9(K$T0Ug`^hfXdk!7YrPjvARY2+d zFgpXrQY;2Lr0x%K)b0HmY8_Kv+zvL+_pbbJM)GBTnQe%eU%WJ5S4I(p%$HiZzcvk> zk(R+xezZ!#f#pStjU3GRd_Zm)eDVp2b3HZ)Xw7q#x8lXR&E6;oCxepmt zW2U4r{{KqJ|9ni>Cy=}=v<2)c2)9duAU_|lP5R$D`0sC@MM#G+3rF2mib3f7|Akfm z^Y=9*76^ZFgL_&mFTRb|K(J9dyW5H;wKtQ^EN3eH6mJ!iKA|i0&NQ2n{heh>GDHKV zc**IIy-`12?K@gYYOIUsTd`<15@!knvH**7b^}bEQ0M*Rmzz#<$cl^K-V~#!oHg zwYOz*NJ#nk_!N!>gB(`7ijc|2Jl^7y) z@oY{J;q6rha{$Uau;UatR|lm!`BlzS3mjaHn+*MJk1zgNBDUbZYNcN_V!}Mk0*AFU zr=B^>YSNwJmt4khlBs|HcxfosMFOguXGW)46S8hyZdU8h7~jnBEE`I>(0*!;PAeKo zC=t~y-ntoD_cTM#I7Rqy$K_-W^OvSwKmDKC3St1tURy{?GfVAnVSVoHcpBE+=Y_o%X zEDXIBu8KsD6CPlOciO}#m1kr7M1~e~40`{%^17TF_D8L{i9~O~{|JNz;Tds*C7#`|N006K?q2M>cwN|ziSX!qiG{u23}C%a z`lwD()~|W6a&7|SC3H38)V%c7`e_;BueT{-PZ^_Fg znfck*I>MZyz}3p2X~ortoez`<877wmz4N1ro+UQOULaa&9$*AK7%cyGV^+UsqG}&v zb@>Hpx`o6-MXwzhpYZxJQZ8clYaN$VP7NzcxrozVS8jRZO3wma@AnX=SUR^p`tqyG zgf;k>EglGNSIw;UTs_kXAGmSlp`riT|9>_lP0w1u48XJpe@32em$wyv<&*hglO;m; zJhnx{`g-3W+K?a2-eFIVGEG-=v8ac))Jc{soU1RG%X#ieh7e-Ne0m3Kgl`(Os=Owj z!)Xprrm@Ei0m3gl-HZq&1v%!vKkPYUVHWhJ zu{h(K+pPO6I%BQeSNvSnj3uL4eA)zTGZ5DxH1vj^d9h zEm**YN$^tf-%}=YKo}tKq5Fl+b9Xg7Nw=n(Ekb#Yhx_dzClNlmX&cAgkYv-Vs@mp+#hgFHg# zm6U%rUYZ{Eh0*u%qT8qkz89|800YGm)G|~SVywP{KIunnsB9dAK7g!71bjU6_;%GW z*z$g_;fJxZ>z>D3>^AK3{6sLQw6B+hL?k^-#_7|54f$g)1)U578Wj`^`{G1DBJ5I@ z$7xQQern8l&OoG(r6C|jF6>joUz_2Yv6j)5k+BrBAIXPf0|75APM0ta#1vA3G%Co7 zg95tEww>+z66RDPU1Qa~UsF>y6=_-@X_fRy+wcMLJVRt@Ee9u_Z!fW*7tcR_NMp{L zB3yY+0p$n{Zg}h9hk-mQ@UNzDEFZt{YUW?4r|&=wtYrW)d-9lAW_#gjwHL3 z6NXTIPe<>Zugd*tE$t)#2@=YC$Ns?!JTW6;U1uBemYhv;Kj1xDHZO&jb3Fe;2zsen zCEDgWIqN})GtvcB)IMAO94LF}IUSn30&Pi7z6G98|MZscH|X^U|6vrSOM(nQ2(KYp z>o@EAKuwAwLyz#Uj?P#y?*qerh-zUm3WNZ2i*^IlX>2)BmPsnD0u%(A>mWM=(Bbc~ z;Ux`rfO2C|cfXb0ISaEX%}&}8p3=RUUoK^lqH|}nbbSu#{f%43ffJKYHcTwW%=Vu? z5Fz}A0;zHh?yFXRS8d^|JUAoMm4o{@8Td;h!mv+2raRI(m@Hm z2Rw>^sPrZXLPP?Hf{=tFMOutXQ4s_cP`dQqi-H6}>5w2z2oQRJ0J)oUe((D~AO7Rs zkN3-E?7;{bd+a@#d#$yfIp;H<1>_L|(G!hIAolZZ?|#sRmgMS^rc1rmRE|iRZJ#qP zJ*f!pXog!o6ddr$3g1)=3@jp#I6EsfB^gg`zelZhmXtfyEkDA+63zz1c|=y$Ip3W? z?f~h-McnyT7RB|20V**2m|EAp2WkLT3b_Z6kbJr zi{BccaFe@Zti(UFU5S67C1^PJ5lD_oQ0USX57|o6r5;3>Op!`kD*!Ic&(_5j-l1HK zB*et*SAqH2&lq7F!GKI{nW@m!AhL^yp*|nbat3e!rURcaQCh~#h;fgMIJANC&|mp$ zTzy(s)vuWRlkgwm2c!L4^@)=Qt0s;Y`isFL8=7Z+G0b0_mK2_n?SFRLURm9^0rthE z{M3}LYjxkVQ!Tcu&pe$^a;rQMs*CgdAu)S!Y$|lh5~ylS1GPbwKY??Z_|Ad*#L}Dq zpuH(7GHQ>#HEoLP#2iJWLhFpu7Z)yr#pj+bZ3-kZwPuCyJs<-%Aowb?F*P|4kxpC3 zpSW@2KC6g@2SBsBH!b8Qb+fg4#14Ev-huj85KKn4Lv$NVpekom(l>@WymYGxZG*+A z-=Qi>4B`U2`BoFokYg5V=|DLTXb<^t%z`_94j5FoU(yc@@g9Bs?FLyw7pcU=bunI^g8siqAl zoyD~8A-5B;f_YtU51yc!s;;MNk>mGcQPoavLK-UG9=bGB^BZ)9EO0kSjUd%33_R-Y zgT1a59m-P~%jr_h^||3a8K5;NupvManze2poSd|F8|Y&u5JDuGL#^jWz9ces|4H_- z%BT>m`?**cupc;6{_`Sl68QDHo#D#|-gxXDEwO77gS@1VK15s!R zI92*~ZCRV>JsJf(7W-(j`!K*k3x1~wuS9MJ<#-^|sfAf}jg<@s=mcn?`KhO-L3NQ~U5}Lm&f3>?LoOh=$10Y~JE8ZiX zhCpIeLOM3r`jdv%fyR*pquU7YypEM0Pa;%1@EsQ!!*lp)09{o;X|q#`QOqbRzRu-Y zvd6X^toCAv*B-FI)IGNnYRC`_8vnt7ye9wh#2q8GdC_!(mnTOo5lMbGCXuU~dFQ@y za?+H#5NXjQ1xl7ei@)|DXL|FRecf{1I*eIxUi`!*IPq@kXw=Q>!(1jY30|rL=t!0@Z;VKh`}q< z^j|Pn!;{S=y)3hCym^I=WE9pgE+NP!U}4C3P`)3nvhO4)9JKQIv)xt6x0A;=Xe+}3 zZ(Y?M;L-q9h`0rRjKdS*M$s`P`R-oy_hI!W7tvNM=d5>~U_b1;YxT_$Ma$M}Dgr}52ppNT9#e!*<60X6uz8G`T$0?!yXZ~Db z_0zmuDltL5Vxl9?1AR#q>pVbLF6m%qcffh3{yZUzXcjw8{cE6xYAn17^jk5SJy&_z zw2CfS|MnYLK3T`rHj(WIVUW{aQ18AqjX zT)`Lt!A3{+vk*tDql!zRYDR_Lutu0})u+`Fr?f%k`$3Bb1Ye=0mixGyaE79v#!$Lh zDi6o!%veBO1mxAojYwqgz3Xo7BrLSpeZlN>C~+G-S5^q}L1-ufTd(sRDphc?KGJMy zzD8O;xh8449`%U4g+KOU|KW)`+%xe#@osU?pLa4Vcv)sBDbN$DQ3)%;K!QSqzF+1v zu*8e-KVsbq8Kxz zS{~0!e$;S*lwGx^LoZS5MX7ILWe*jBD7aKX^nWKuYjyB~e8 zSiT7C7cP^clhF{zmpfwlKqSz1rYtS{z|EQHKI;Bg^EsnyIeTXKfn^8J(Mt%(Fs{Vw zv3V!G-dd-$Qv{kLcd>w6klf7#8@TiN=KKyfUMz4Rc9h}+%Za-9o)dG?Z?Vq+?|CsY zU+xr2D?1@TwC(AT(L;hlK)x&8U}p0@QA|yv2btVWG`ee%et#0l zETy|1igP{xGuk+6>dosx#SeK$@~3ugn9I?13a5I0!E$^LXkF7&<^X$gW{6^V*!i*~ z(MCC46Yi%zW+j_~jo0)sm8?xf8n2v7Sf_lnQxy+Gs(jBwN-D*~=F0Isa+GTX@&p%@ zyw_(|1}^=HmB?zQkIQ@Je?nb>qhw~k-AHOTP3fkX5wVxPELG)Dp8hJUb8QP7333z( zY8;Q4Z{M>Cxw=KxuEq6#*3^{q6SnZhRqDOrId2P`rLMMSY>b5K&4)x@Q$;Ah!2t=R!!IA)EJe)&C8YIdSNAHiHg5T zym;2p-R@a;{6K{ghmVf8>g=B!7>i(?RR3KK)&4&0pxVoFsP(aB!QsQ1RMs8* z?E2FM4ZOjl<74rvP0rZ)O%Yt|&DY?{e#5pv8#l41jwm{K7u10?ie-Mn*a7to%RLqz z=Ks|dz`KVs{cv{^OknlC`v((0;Nq=aWTxpi7H0Q+(d+49HPW$7=+pgD$at!$<&wZn zvGLlXrz)vVnnwzaTf97JZ$kB-Se*TEQ+-8cxx4Fd#xJOAGdLc;^P-C`zYl~rP&$x-n3;`$1ZI?ZT61>|i*w|Ev|E&fFR-w8k?l;S*4}1K$MogomNuPf@$< z;m*1Ii8>EISsQZj+zC9zg_*u!2P+hmRtdFOd=0)6->FytV3NL+s2_IU4G7@5sdJDR zuoR&8*9t$*;^I;V6oyxuSCcWnnZdD|t7jO1K5UTNAF%}12IPxtL(MyK6is&B{x(jX zPOYD&dOxG+Jh$xgrqN@?#IB33N#a?@44wCzH^=$B1>O`-O_hp1y_iBf>B{Phs${93c=15xQI(zjOitj^#ZidD?b2Hwjt zEKtzu3;eMRCBqxS1YfD^@y~Ei(nquHyKs3%$yPw8%X`t)#o6__SwD&O(Mum||FhgF zLs%(5lYF`R-4(X@exS!F9@4?(BR#!bjqi{r$n)y%f#DT&{SPTW)uV`}(4znq-?4iQ zP`c9uKtUnZFzSQyV|d18+sXiDrTs=Eex7%`Xo;6`4(K}5qzt-56d)bNe&vkA-TJbYOLq zI(i3l?s5&#Y1Ir~1`7V{Tq`-q+~8%*;xx_z=!@SiHt}8T%n4{Kjb)HHFJ)yY`-2zQ zUW1v}v>DupaWibQ%l3wyfyDm+C&FwQpGZ+Z#8bs%fZ|0Vh?iG)=h-b9ST#aw^XIJJ zHMbyeBnS2C4VIYeP~{oi-Q!t3M*oN;|LwY>CJDD5#VYW{_{{br*A)lNZN3d_^|zhc zfSy=ULkqgBZg#0~Ob_PR#i+Kj)fIt^SOGatg(ad`9wY14+I-N%I<vsc(EJ8un^+;PtBeCa$6Aa^)<6k0dzD9q_ceM~~7a5`2HmE0+)cDu5$#FJZ z8b$+Gc<$Tl;DY_wlc6DJ%k~Ki)H=|F(ihO~LHqFe#HY3rA9#=73+^gj7Ajd>fj<8o z)_MurzY$IR?|a71dG)x%qz|~JC>EmUSjm)b%c{5{vM{To;*H(keB*LelO*aMqtt(> zehbMwwab5+fM|tV3HD-jwa~<1 ze{i4z(B^U7+rSS6d%ESKg?i6HAv_>^ouch_yyN|--{RM3Ygx+IMG3xF;7uo>8>UrQ z2LmBGKq%%30@T>J9YiDz+Bg-XL(S7y!>cwHyZoHMq57n=}g zF$DAplkr2}cz(Fa7*u|3MBChZma(lE46(UB7nX6%4n&SqsJyU`ITeqb+H=w(Dm0lp7$RizBYdT=X5i79 z_R?SM5CH+hPc~9nTFRHt7UJ-EaZTzmigCN&jstB5?e`AWByP}Omz3AW6z`f!3HEA5 zjZ>JT8foQ)_TTZUhWt_65GcFkJ1F;{1@-F7CmV%;nu~s@(v9EUF@7};L6bM44CSSW z&?K?ux z>#8za{Lee^UuXR9zWIK*;a?J<-aJ9t>z;kMkyINhedv5CloRUVq!9Uzf+$o~4y7Re zMB6ZIB0IyZO*4l6zpH)c+NA>klwg%ELA8-1q!nrEK{kX29u2HJe6ervt+to+`>`5fm^$xFc#aX|Q>?BL zDb@DcqMoVr(ykR&fYS&g@=M#t?@#$vZ)~FD;R=OebWtv<$2t^fxF;22dAx?0*P4_I|*A0Bs7SOJY-YPqo!b772 zvIM}B;J*RF&W~ZUDN-W>5oE7qx56{#^e?-YG#Lg z*4bl_dksMGOFX$dAe+Jl$bD%_E4^-pv!V*I_Hsl4wYyoqVi4}3xVplBHq!ZLDs=NP z@g#d-btEAQ>n9%EWeqpLmkN+_p!Lgx=+oDka~DLOvJJjT&~po4r3dd=-s9)$=~*4^ zD&(TzAUabuK=U6n$bu7FVLw@Fe|RJqzQWa(WYz&t`sma#=a?fqco-c;)smaVf$`3p zS5{kC*)~Y7p&iMR{bx4zTE_^aWE91N#v0u}=+$%42YyZ672BmZ*3-BuKc1(+4?mZ* zKQUy{y8-uNzWQ^xZ+JE9nDDf#ydj^A#nTdJ$=H6CaFz&!Oi3s!^0untrJ3`*C(cX7 zI#BrAqw0B#e&!cZ9*>Ygw6rm}{Wt1{Cb>G8&@<`b{(W3$l4z?O{HAM!`Qr0NzKXmI zL>g3K_;C$NsK?7XrtZL5ObpXi;;rbwTRIKhd3O5p^Z(N!D42r#i2X|1?U2Ke`E)@ydYE+eb6C5-)es*nP`$jEo#{zzgKA;bZDL4d5 z-qfjlnqz#JKWp@3!fwpFrg&OVfBTKx%cGlnWB7)gp6==_2V!bnki>h-t_4~J@ zUTTRAoi1pl$+sK1HqHk*BH6A2YRdY=)c@S9=h8a{rv(z}3!bq(ZcYYkj67HQ@KZxP zDfBjWs~9vuB3a=X8A=5ZuBfJYB5~BWz(L)w>`dKg-hI=WC)bK0*)zrbmLLcm{8ISoXO38t^sg=rD`S#f0{a?h= z3>tT7eeS zuW`h*`>&0$a zk$rKJz0#YRdOWfH;`Zw<^wDtm@ICz4Q(h#8u&~isgrIkcA6Se2zzyH&&sc#=+cUFh zuc!0fg~bkOAXr8D0{MJ~mA?eeUW#0ZXVPbEm?zL`m|#pCE2TuqFHJdjXI`<#Z_;bZ zhl~Y&?8^>X@v3Fze#~{`VDaT2WoDy#dly-Ubhp%zLbW9oG6Q7?NWflRG9isT8?ABr za9zFqzQgpf3%&QesfX$44}BkXCi_Qgp;VXduHSFB>T-&to~UuvDf#J)B+hGw3k@TtD9smcl7h z%InV2xuzF|6;eCpTumZ<#z#uFYp|S+lr<8_Zf;Jo3pU3(SR+hy6?-bS>OyrEC?nDO z&8j{EtdSQgK{Ap0&n(<-pcwAEzUy+w&a<$QcBHqd3l!i}Wt>c?j$k(qp}cnuUaFuH zJ3h3v1yCW!?it)QApQnQebbx<)2TtMOe_}}y9)dVh^`q6l?f-fQfG_Zt+|rlPC8ls zu{0NSH&`7BV(03ZmjZNSwDHZ10AOuMYl_8VVt4hihfXsxEQ!&Ltyeivy;O411$geE zGfjWzCLNb98q_Nog)p(KtS)w7(Fn9b<{0)~cXncLJDR7Sr7n-RRN&5o%`6uAt5=x# z@-}1yA)$6GQoSQi9KGkLvje2i7OV_PFY_v+T{B}wbJAXLEJ>eHM<8kTGTw-Gy+NeZ zgFn8RkwpqNA3r0{4+(Yh%C!T$#@MdaezuvJVB`GYu{2)}gbA%u0o1ZlG`3E3S8=>~ zlpRNQhU?k3TP1LmkoP@QQ-M;CGnjW+fr*)!jUYP6d;MpPba=$;DD)In6Z~oiEOQ*z*lA_zV=AGFNbqD6N3CIoSVZJ z(J6FsE}iO$(pAu7SG@h;?Q|iDfh;^E$x(!(=+=?gbqD4&|tv@IN0M{s2$sO zyba3Sg}2fMB}P``d}sE@ztCv)3eR}5enz~q5MamPQ}x7_a_i>ixam4V<~!`7z^RHn zJ7Zl`lX{DWdvbmvsP>khb(osIvQQsb^G=F$?*?G!JacJYv=o=5W9|u*Q#gW4BSh!$1@Gd zT2K!Wh1*WR5CBKc`^fS`oF%`6-l@Hb9ytXSk2eW(^b9LLXV3CDw@)0yU8Th^w#Vq+0N^;Otjjk2sygR4K$f7p?Ih?6? zdF|lhXHwyheGc6nv18G~NyI)ryy;;#8~CPzcYz1?KJ z?$i8*ex=+d3|*<XgG=ML|<>rbo~XErkq zoyz2*``!Oc$`FL`DDtv+rfESlOO9%1l>-HpLID^-_<Yk|X^FKOWq zBn1Sc?J6H0eEqV)Nh1{S4n1wJbg)lV2sd;3PM)4s7ziaKC_@lWWca2n(?rDV)#SS* zdThLO*nQzAwT5knLR)KmaQ@uqTnC}eEk18`rE`5X$y4$Q(reUKVb~kEHUG3jcN`-HO`Uy zuE;S{LAkID@0lUl!Oo~9x_DEAaQoXW5ZZhtU!@|xFj z#Xux^s<{9iQnPyOLHFRbbUAxTo%k)v`vKcp%7ZD${?d$UHfM^iZwSI(DfV_Oy%qRt zwQDeOrD*q4&&2`<*9zMsQ_EhSIL?LPBRlcoE1axD0&SpT! zA(2L2&l=n&7g6tfd(X@_{2bMy`Gr)Ji*QqRuWkEucA+&i{9e8~NUy;nGvf9qOKpQ) zkygMeagFM;Jb>0}Km1Np*SxfDjS;FY(n-+j z24-jYvq40x(95B%jJ<5MZxxW>gYWXxJ5D&z)rLVG%ZJ{}tp^v;T?8qNh7#ZJ@gZOT zbgo!Uoq0oy`=_zgl|Fs(QXd%)jf(ll8NKkTn{L<*wZi$Fasd0T+$_*4 zFE`uXVm7~rlZ6&pHKh2jtqfMIkdMZd=X+neMx=}pdqOyhVq}*OiOns?Th;|RHjCQQ zf_gZLTJ1C>-QWK*O9b(LgHMNszkXF`2JID9K zEM8_@t@qQpvh7!c6^j(LPG;t`#yrCn?t_XuZgfm(xiQsSy6(8}{Zm1j@^K(f^G*lG zANLCm%E~8bG~bTqEhqs-WZHi7F5H}bI-Y4e4n5jXo@eZ6Wlu;G>^;@_HMnnR{A^1D zYN;p@q^A3BHNES=T@+ZXLXUjy{Ty~%K=_6okv63I>vPwq&FP2@utfnW^ml#iLypS` zuMr&%7yv_zpzT7bpLF$lDmc+2cR^fIr={0*!+QS6Dho*s7pp53H)$(g=yxQJa?;2x zU*7ML0jLE=y9=;NJrlZ_gDSA$VrGlp9?G={F^@^48LpXRYRJRiSr7p^v_S2CVWo(A@inCjq#VT*5X5Gie$zdP}#2tHvnbH_GL$r00NWKRb zS58(_NQ2=VY8(TL0McFJB)k9@B-)^K0e;pI0-JakWCa4rQepY$z%8Sr*SbVB@HF3LIu(b?*IM%-AwhHwL4|60=QCT9fQae#pJxl|h>( zoubCGaBzgs{Rx*_j8z|}UbU*D9B)yFfH*X0X8WZ&jxJJ7Cv$M^?5p%i?vUsV08SPX zsYUdfFw*6#M5DYHR12+ok9>66`y?W?voG}Fc$VxfF3Y;(%u*mu7JC-9_!Y~pn~Il9 zz8)$XiMyh7Ejlsso78yq(&d_PrdU;u2^}(J_x}Bxxunpn#1{ORlvFa(?cfMlPVw3+ zl#7PU4Yt;#bnLOPjX&2d={9+k09LNgSfEP4b3Mu62qt&U8;tgUh(Q2OJ_}oh9XlD$ zw3q0)%?#V}PoWvJ9tTc|y}3IWdwInJTZ$~~n1G5551 z_G5Eeu<&_S9tMk;h-c0Q;SmPzc>*kKX~Yz6vaZDD+~H#o*ACW5&x|;4eeU*JEBV&R zfpzJ}Vw1#EJPN-64PNrj_~}iMd8K6%Sc(93v{9`VhPQ<@|bP#O4rK3YC`Tn)p^XJb05_5e95YyNsmW#1mw5 z?M@=>TiH%fr02+im(IYe>}HmEHMXc}!5FXS#cUyF6Q5g%9y%QJq1t)Ka62Qfhu^@E z)rn1}d8SQyq9q~BS`OG4M56CAc`?%kB>{MCA;d7OMVUqJZXPt}mU7l#il&lI&5V^S zY5Z#a2fJMgq4<>=mQW}zoV-vlk>}I>N0Zu0zv9N;{O9l|7_!8x7LcBm`a8ApJ8`tO zh|Ne5o1@QYK7JCe5SiZCnM-4AeSFCJej5pkQ;q$`!3B=YxjUzkgTAI$d)7~UIOi80 zHo7%38_opb-JRGxmoYz>0F17xg3O9D`6Q=sro@DXI~6T!_{`(584tu{i+*M|^$VR@ z<`?(9ZjAEVjNd5$bW$(yv6~OSvw2+kG$JIbFD&3H`5Zj=U8o0Zot7Wg?5WJ$ai6(r zr=CKV#HW2>$1=}#AJ|X$I4B=YKeg|mFLpFH`Z==xxb5k{hI+jE zvR$8~%}XjMyFAWAu&^B=w$l!P7Fy3spX`*Od{%Ga`yb(% literal 83241 zcmd?RcRZW#`#u~q6cMEav0LO_wMnQwJJhHSS~X%-t)@!Fj9HDK%DeWg(xKF-QL%{? zR4Hw#l}gkmi0w(+@9+Eh{QLa=dtT2!_lrAnkLxMv_8ObY^V82HpScj4jJ{WQJ>8C}J{q4q-qjhaCu&!2iE?3FrP9w=$W^5c3v9I40WkuHKB8Z zqbL5V*_HM0L0yV1A6_>PIY#g~TvGDIHeSu1_!!KppcX!vM%n7Lmisc{ZCbSBgYCPT zos6X54~JosJiz|b)yF1V1jdEM6j<0@`$SvUgrIhCUGJX4B5`Agui=aEV}|c*r?v=A z(P75Tww06rXT*Iku6s42o_b}^e#IZm@WIJ*lYAEBIe|=plKIOO4;q~zh_jtBTLhlj zCBDZ$=aPzckQL=Ft$Zoga+zb{xyTee?#Ws+->P}V$NE6-s_6ngUg`#MBa?6HqLM?S z6m^>~{7S`jZ0_hCL${ZF$8X@Eh+*r}>%LG{#hJv{a1hgh$<#Gps0kT+=`XO!r^sC7Z4c=EcQ^U>V+}l@ULH)sKsGxQ+|ANQEt6x_Gj`Pk{kb&-fg|i&dZ7=I9)8Ipa$fSvX|NiG7nBbuJ-3!j*z;F4 zz`o2hUb1p{*!S0A11G=#OoQ1!8*4}8%PRtUDera@)DoChjPIm|O3u4L#RnX2dNb{D zyV^HRI{Dm!TKW=~?&U698tkueYJb_rxOY8$Rd9H8+gG)GsuSxUChS`?F!rk8PIu~) zAbF?iyr!6zGTRQk#ntjCB~Tqd4Ho{G;V^T5#5?+;vI;@prSsvvv-xtHixD`c?@!cM zw}Vg7mqZ8KI*=M(bP8LhG-l^_IMiXofD@aXqg7iQ<{g3AC=q-1(obfeu*TpL0?D zeGE=eaV_xMV|6$;__LZ!dRm5Gc;2k(s4C-yo?_gvb%;VIcPZ?j+RN+Xd`VZWU)ZKZ$-X3+m5uKb_25ODOC95WX ztgpagYa%QxZT6VBI=CgBt_!CXdQ}G(^``Wrtg+1*w3nYbr)#^=>v9L%f&2>sDo!8k z9h$AVw*V?aKd6~bD*?eRI990#;7b_6 z$hDv}w5=_O8WDWZiMPK*0<1sj2F-NoEW~&KFet?zco5{tk56`igndRu&R1|F#Z*Bc zjwm%BMAh6%e5I>W?hF;LM1#r_c9QlYhlyPOnos;Yn~%)#;~9xF%FiBw6_%@l01_WB z+Dk`aLX|%M=tns29^g4sxrR8ciTgJ&EfXa+C1v2k9Oav@9^u$pO7-)_o{f2MMC-aT zu0)8Ne!A+Q=1*Xt4BdPYmH~=j*Q;T(RM>pdzY*jOz3;og5|QMjbY`_3dirwYHggBU zKgrfWm-MJEV6wIteV&uPiiv3ZXhbZ?(eg@F@lbU_y)DnT)2;-DSc(0JqpsCiCUj0P zb$_Ndap?ZF$$?c5EZTt)$8;+QydddSU-iZVOu?=;V<98;-DSAs< z6}*ZHyZQ*KB<*+&in=BQ?AY&%G68om%oaxzfh%GJhDdR}{42@=A9&(eC#^6jjGq2i z=8}?~5PMM{5&&$@cB(^6npHBgWDmDgkNrh_^h8Wg`%T;C#} z5S2shyo;VN?N8unLF`VuZULCiTZM!=GTM2_X9^}$ zG%hqyGQjB69IL@hrha2PnG3I_-!+BlR6;J1rnSj0s()f^>6U61#sWV(+4GESI1dbP?s zeCiU(F%tbl>CQBax+G%8x{%NmIX^Wzfs-xJwT=~~R;C@Q;iAqH_jjB+bt+Hoyt7bI z0A@fpz5fI+<9`LOTj1AQSD21S79aQSVifi7KMxrXiEdB_VKlxp@GZtcnE4DCNdQy~ zf=$Q!iZkO-bT@~2jjaj@0i4m-V2VDjAkP)by^J&Lmh7!n7YLlYEAwj~$0vnBGOGt> z?S`y;Moed9+_Tgc0lZ!h)%(d8{o=z3KUNG;FMzAf%yx=8gHPp6Ja ztMeOnw-auF*>c_y*P>#kE<#)&=4nxm+ZTl1eP`8wLo+tR7xF%4JIZTC1nKV}0spSu z4)puLMW4xcWNRSVbHxW@%jWPMNOF8oQSRpN2?t$rjvd(pzRTFNXl~7ZvXs#(1f*pM zE1!uY-^`#zt=`yg@=4zq<&D>&J(JirQMip}? z=cYK1sf$I3bR+bllUHzx^2VCt+k))IV2NVsOpcb6jj-Yq_pdBGa-tG}W}MMn!_46R zxGS72E3J*-2XUVuETNCpD7k3_hOz0%q9CbUaClrOGxVV{2pk)CrYuTv?$+a36=KM6kA$JToDv^-4P1%2IQ%5b8hWSH8W?B5DP#sP`OeeKw)-6s z`1zp8OALnsOsob__X-Bel*K+g9?UH1^eadA$9TGUwN#R4t5t7Den#Z$@T|I+TQ-kK z7>HS&U=jo-Z&f9t1wBa^b;8rO)Pn33V$ziMs#q!NeKWeajTM~(gSy1YIqUc{F$~UY z4OXS>UW?CA0__}%AMN%Ny^rJ$eni#-H_!E+T)kEIdBWuhrC@TN*M$tAwVUDF4xoGO zcgU-{fW6x?Bymp0frc);oHcPVZAvQi@s!l!pGji*&mL= z^;2jOu$sFK)l)uOxQj};<~I{DZD{it67hv-+ONTldU7!$Fj|}F@DI?8LywuUkZ1k4 z`a}36mxO5$oM!M;>E2+-?%>ZuwxF^o9*mx$c;_ovA$D*&BBu%5GoLriewBRppYPA2tb7eEQz&;y7 zFHYoX8bd*>V%X#aBEzjeO=NT}Tf!wmJ-_KLe8qUbLXe`w&bv-u_9~_n`RPYJyV*?XlId;-i8~F^M?BuXN^qL#TQ+(z0KYae5 zEEW=_qU-6!4D81WQNm%<3h$Rjich_nl>Ngh-4fN4Us;am>%Kmp2$m)}nn(X)#Vq@)A+ft_-h=Ph^*^jPws76(=S+fKtMgCzX5b6I4mH#2EWX}JK zbDUBk_V>qvoCS9HB=c~>|CCpeWwuRH)QGwgaFwN}vv|DL&_Nb0XZkGA+M0NCwYx*+ zWP}Ar(_0&{%S&mqJIF>k8s%!KLn#fI7m1hdwCLCEyYNG9^1)vs4LSLrlr9mQ+-~h< z<-63P3_Hp}P6l-W)(#&v?GM0bOtU8Wn#<`_^olh!Cr##MU0MCx7}x&r7aSw!PrfbL zxm*zzjBUqmRdk%UhGwVHDZnyvDn)%~@T};)Fs-ez6yyVJ+P}cRgIMri6w^^3~D|K4-N{;UFUs)#~ zxXVBqDYk{nci(<^7ffg;wv1`gxnJs&?Ul_^23rn?Quqxy@7%_=o09il39smr|4yc& zAP}q1{5LgYFo$ty&Fxd8cUqE|g^TFaDcJfzw)4?dxn!CX+-!u*PAnaIH$*NQ@I$%5Wo2D?&L>@I)S>SUj zX@#%ZeBA~+WlO_7rjx2pyO86qgNfz-)&2LoYA}W6PfsTK221Lu@Sxl)sCqS)#lfpy zS!oeV@#7oH{?r7#R!=5@!60YjxD6lH?3RyyZv2cz$M`y1Z`Tne*}f z11yDDe*BnEhHu}ddE}r;>$sL^O0%WtX!Rjg5`EfG`xMvjk2M{8jWGIpu?#e~ON_`rd@w>Zr z99AE%Ob)88TtZ#7FaM?1Oq)d05oz>xE;`f=7nPKi#bR}2am$?q&~$}5KF6z;=OKOW z0_Iwv^F52@1Bw1x2GHJ6sLfJ4 zc=}cmX^g;7xYs~t%&I>0Nu~cRAfa6_$0pF2W0Ix>l66p2>hvXYMX=dBcE8fJ1tBp6Tj`C2q9(;-=m96BxT*`)PAoRsgCN6U22e25e$VUEC#Va7LD2(3kA`SA&S z_#c8pT(5p&6jKP=yZ8guk?zRi3FnF<(ccbilF5Umr8ifDQM6IMvsr0Um8_Q(L5hr* z?ve8OKI`>r&Q_^|F2uWcK&hBzGm-h8{Y#ff7=^R%@t68|bQbJ|_!$TczZP4Q$%NLQ z{|sEI$o^2%4(#uQ|5`hP3seaLL4ZZSfG$WnbxXAcVCvfrSv{etvtia$4_3V!Y{mOgt`?^?#-N#N#oF)VZR1yNPJcN z+y^_g2_GBDqru;|6WspP9RepFn9uhX_j4mGV#ra&vk3~0Ajs~G;13);Pf>(RcAjz# zARJXS^p;TPDk0tfObaT3(#e=L+#?QgPM%{W?ZW6^4A}zKrxud^$UmwQhG;^#sI&1~ zKl+XDXXw9w+8#F1pE_z<>;WPiQqB zB}TIa^{WUu-oQn1SZ8}2*)wp;bZBn1RL7)CN|N+`r)ebav-C+Iwmnh%rG%yZ=Bxni z9KO3SG|is&g|IgeEcP{^6gfIu^Ehne)%YRk&~dzJh_m<8ckd1eqtyWm^}l_S|18&TobKK%GqrY6YWkvz z>5hVrj@Tg+eZhf~@Z3&Letrx?msy!y-QO^@RS$E3=opaNgV=dhZ|SbtMj{v1@-~ub zxh^`_nfxLjF^SJdJUV6R>%ygYoJSL*hl69OB3!IJGAKW{-jq(B_KuR!WQK42^a_zK z77h=$U@-w^FyQwioPUo;ZB}z1rY4rl`rvG5HKOHK0+X3U?*|k6G?e)79JbE*XIQ>W zRR^>yrf=tz2rY6SA0&kW#lq~SWQs!ymTfvqV{lAL1c~wKw`5xo?w*vF^Co0ya9+)T zuOOsp_kzkWB)7YZIW23@ABbZbXxGxKw)gH&`=ZpFbTKmS^mQ>MGXjesd$?*|d=g@n zc{Y!l&ZMXx-7zu&*pJ1_{jxZQ5E>|xGY`}_UPi445&G8ixtCv8uRFvYUF@i^!?E*Qc|QJR$9uSvEM6;-|!Vhk1{ ztRYSeQqdb`wWGg;@m&#`+=_@^BG>mf_yBhQfz*V%?!8;>NRjknl)gQI?c)@FLQds_ zQwpAY`#qUM!1GnR1;y2ju`HsM&?jmuU-BuWH`A@l#k2qCt3{JP@s#-T%4rZ^-@BcU zP1mRDZj1+Xu05uY6EHGRzBB6&W&+Q$(1BV&{Mlx$55CJ=@P*)>1CGseb)PN^_W?!1 zyr!abckejwcJmIVqm~F+zCRuTL^cT~Zy% zeBXxj^}Ohz?G&iNV(Iq6hrZ25w+z?)$@+1fP>1lJCEl*{{5Pkx-uYI&oBUSCJ@$1Y zr(yF4VrxinzxjSwd%^>}CoS~$07-Pwrw%n0aDMe0ZGKZ1Uh`wC(P6W3pGj=tw(AyL zt<2QxYxYbH`w?y(**6)L<!84TU^%$`)}m_SFWaz}O(L|3=G9T4sIP{PqP`#V`y` z040>{7pf4pu9dJTwZk29JQP}P`>_P|AoItf8!D+~TyFQ7KcGn&pyC916ANP&Vg zm1j#Q;Kz&{o68?pTN7ZAgZfXG1$>}4GkMZw;tpB(?cV$D-=VFHJ0BYgk0ba=k?yng zQ#Eq|3HxRB1FR|o+T@>pLgTVB#qI0vA%(uP5@L5Jn4;zi@{XENb~2>tLQkKCtkpAA zI1zI=73}f?m(J+b@bmj^H|V2jr!LijA8uXRdy~fnDGzq}7jDm9FZZdowSCOQGL|vq z91XGEzFcvm(dsXPUvHx$SKx_gwN;I{Egu5IxO~(0S(Ta})Ex!kRTG=Am)G6A+7Z(T zlY>69=yw5M-&Gv?IW*QTAGuB(znI!Sw^7+C#F#txHH=2f^N4#;)8)UqKjSNe%vF9m zbARWth4)01E=fyS{A}yzusd-ifSBm3GJ;i4?liB1%7fNhJU>uI_ zQV;d(SGze?y{hxWhMNRXnMsz(%|T7P7CSa`KA$02I!0)P=j~WeolUcQg9=ran$^k; zjrrGMN|ARqD8BhYPt(@aUX5Va&X$6Q)-j7=h!~6O+KS{~VgN>(zxcgiqtrkUmh!u1Cz!k9y~J4PaBgQkMvYhld4fu2HmiIH&+LPPYxkgFf@@b zmD(o9z909`Cu}K#76dL`eV(Rp)N8V9Q=Y(tgN*ihNboq_>PS8D-6Pr9VYItDjx#oc ztIV?>p45Jt3Qqlc|_h2v`yVAUSM~t-?_-Cp|69zs;J^2wcSkrlXdq&PPbbR+4 zS>YgBrtle;pB8E;1+Wv&hv_s=VE!u^wJ|qZL}RBI~5Gp|K?E9@m8qs0}4zAxW36t zTRO3QMUyg|;H<<+&RIQ^;EhjY^tY+* ztK+rzCP+W2T1K{0>|Y!(GAL-( zG!{WcmoN)*;&mhZ&_F|tMx}BhvTw1fdZ`RYX%QY9))u@B;XIu{q zvAeet*B*&XT=Nc2KofeHU`;8rLR1UU?XIoSTP8ZcF5DoV?TgxT>PPM>Q!Vrp)$#iU z#0{8j_d)%RSHa)f(r*6yW$oEW`L{!MHmSm0p$C7mx(doa8(Hv}_gYEi>i5O$@sRP@c0}FwTN*v|9c@Z$2)7dEz9SH%ww{D7 zYn=epI2upEDA;vq4)gXZsJ_qJo89$zymxRE8B9KsH0lX}9VH|4BnK|8*Yb7EIZK!9 zcwl#_nsU!(1wGAgI=x{F#lg;=$tvElu4po`H{J1dm7mTSH4p^JCBB`p&#Q0~%oyFc zfX%b@tkQhm$x zyt%a^X~Y2mhQEeq3NsEWg?lZ0sU z!t_{%RX(*amV8e-6C=@tE@2$zZwjPHg$9c~v>{P(&{|UN{vhHJ0vA>9uvtiG5^h@W zCkUKMFKnTlBrS5#JIU{f>bT^-rnL`*KIrZU(s{i2HWA^|ens6^Yjr;{V}urSUeqU~ zfKe)fWUjSueQr)z7V&sMl5iTAo_9C*O_(e8)a^$}vOjQ4_Hw&f$Q%#$S8R zr0*3HCM|yQUhX&j#?NHCEDvb9vX(_ls74+qmsvy)*+1weRyR{v3UhIzoXS|JKvMB?uf^4kBWC-RI1Qez^A0lJ(@}il!7(Y}7f03J zuD%&_8L!lFyNDB1@!d2X7kInC9dC_vZ3#fwv=s6J<)(%$fdo+3eJ|}sr}z4{i-}SM z=F7HkJ-{d%^e47Rl;tGtnAj8&=W4oTA6996{CaEjH}MmeuYlZr`uv7 zZJHL1U%Rz#Gek*Px`is{8w0Pf3ZrL|Lv1$N$fR8S7gKe>Sm?-rJW5U59m`To)JQ;63j<6od3Jb%EcU{kE3X6MWb3mRArb_u(@KM ze5+1wsA@^u-!NA)pi4ctkZuIHo$c4kOOF--)*C(=$e$3pVwAQmF^abM9bQc;&+Lo0 z#WIi%RGz)#%x1#^Ub^k0doq6oq*y-eC`(}Y#+Q=WAT*IYE2XYV!jzk?9mS+;GK54v zcu!zZxZr~AXeAFDMwVxV$5uvDNITyVj{mR>o6oXf@oPN81-X%b5gi#oSD%qTqXxBo z8iqv^U?;YkDj=IFA4zdaOa`k#eyD>h)Xk$IWD$<(Yll{Qen>e3mDazIno*9HK}qTpjXlWxegD9K;K8-aNl83r5JV(C59XP=(q z1sG=r1i*pGGmZ&Nb)EQ?UXN2Dp^=MgW_oC)%Qq?mNC3J)Vjv2=5!P$u7W=1Cr?=nb zeR9HBW&%jtE>eV;YC;>n$2wuG+QvYNJS!^jlYzdiKrHE! z=zH?zqdlo;RRt(#pc6@zn#Q}q!c2_ujz6R`iA{`x(*31oqUnAxH9+T4cYs+m#sjb} zA0wnJUMD{{qmqi3VTfW$RECQ%+Pidr=sCeAE7g`4dggJ_PW`TLW{ngH5a?FMqYw}c z&m2=AaK^IHy1+btu%I90zeIEbr2I<7l}1#Y;#R?Un&97pobK%LpO;b+ZgpoD+zlTH zx%5|G1{eNUTcN|NLm8&yeB{sYC*CEDjQ=AT{so>A9ohQ-@6W}7LZmvs(|ujB7R*g` zlRJzp?h=%8c!1--e6^#(kDr|H*X7a34pHpfUr2CHxBcZ?u6is4Tr*Cz;M*shz4`L% z`$>cu@zR~&DF>PTUrKHv)A7X$+eQ>b|L5U2tISu^XPv*r6XaGRs5EO)Kj) z&~}ylqMPI9IKA=UL-p)Zq88coA7tj_zK!0_pBT9=LvqxhQe?S=7xEr3d=of(YX5GM z8g|REfX`Ry7;D^fl1wXxE`{4Ju9)mPjb8SL{Gn*XR<2(SMBpC1q$~;gWNd_Li793%QvpDkIn5JGzDRohm!bZ zYuNwH7M-a4uO))@$XXZHIl^}{D}F%$`x&ugkIjh>eOy#`RzWO#R>CjY&lEdVg3wt# zFWLD5t+8_p%Uu-Mi&rp*3mC*TG*I^rVPQE)0}^M|73&b)&LraCH<&LsIODf*3#}36 z9_E$KpAOJ!=iME2zv}gr)~N>LZevgG@YC{^F9R3(4t7bV9wpCC@^NSBo~3az;98E* zSh3ScNIpy?^lK z!AK|s7-zkZ>rSZ;%R=_yxluYlLijR+j&5TuitD7yb@OI9c5NIxGAYwK;TP6jJx_F( zr{~cwQ$HRx^97SbwaHtC;k$11LYpPg2s^rKU+eX_IcqAVreG4>nUp?uZtHlEPpq%& zGwbHf@Ar=R=oG5O3g!d%rL1P=bycx!I(&~*wsH##7U0{zysZ*$zo&DEb6o%_p7l`* zFP?`h^<%;egG0s-@E*H?j+*TL=%if@vkJEM*Q%iT+ZLI+Oj~;*o|&#PMRolXBYS!E z>>Yl`JqPIGtlIOT>Z`zyqvnOwW0)-#5c5%+dK|i`UNk-FcOYih8{t-4h5sNX%NfE#ctS%bt_2 zEnq!aPQa`o^Ogi7Ik{){8mJ)gs@d-q+S!>Wta#q_JAkT@uRLmmutv3EuubqZ_Q7 z?ohM7K8!>M+Qr*ZPU6*NMGtywmBHW0t-FD^s&vLzhRHDx&sohInx%P`3&dtUC+9c( zvX-qC>B{+R;v!5zclIP!}`D* z7bT1hwM*JJR~l56-Ea5}8VKzFDZon7dS6ZUQ&D#4PqJ5qfi3`MwlaE_3#s6SL>NIC zk?tRNyU$9!UnSC0JRqV?8Md18R&PX7rw2SwP$pASPJzW+z}HY>CO0mFGvmnL5!3a) z726LSg}@d+NsKs=Vs*uP318pM1i3go-AKAGzniL7#jXe{&P*c5{$d;vZp*&*uxv1H z$Gr1sT93*{J!!Jjj&Nk&dpmk$mzt&N7T&VY@G7M{F`u(kMLCiEVd$jj(?+^{LpS#F z09Y+yX1`3KQ4xCo9O|Hh>D6W#nezZ?{>cZo>0Fn(Jeo)E+p*$y{6q!WMyO-hP)yy5 z3)B)g!OQ#WHaRpqAnzH?ndp_0c3H6`aMaw+O(&+%hqqXP4m9aLnkJn@y}IGSU#kqo zEMvu`W`}j%6XL&Ipr2%ZWu4I``$tE_pfC*Am0Kq!#HFyf1Vz^01X;=aKY%Pwshw%` zp3qv}gU?B(Q8Mo`)CXVvLvzqASINW%a+z1(AmxjGaDAAQ{J`;{*bHLGZc}-dqtA92 z0U+C`V1>oky6GPT+@Sl6@9^A-qXBp$ym3G2{ax$P>=bS`lW0xJ6C?}{5N&(k348gS zZ7k#^-78ZG?vB>0PLo=vcf&UQa7?|1dz#Bf1iZ+MU??WQDRqy6Zj2s6FFXz6D61}} zCq3*THgw$%_m#U7(V`k8=gUp!t;2=7Bu8z};yGvjp(I%qdRs^lnHa}~TtUkiMLrG^y($Z=~+=L%UG?^o3QK4?+?9c(p0F z!A(XJG>-?{?cSS0z&Ifzm^46Ys;DJxm=*HcSUff^2E2s3;S5r|*K_xLA-mJSU5yvr zadWmxpy_*5f%>g<-FCIn5O%Rx0L3RSO)y4!PE0A`SH%00B|Bjf#(7X(;8s=^hui`) zIAVl1GC}+@#=nJmrnaj}JoB}T85@zM1;N(3!%nsGUWjmA$ZTrWR{E!yOuF>dv>rCj zNZ#t*dNH8ul6x)!VyY$PY%gk-N>6UpXNT1kfpy$&fWn+G0G%ULD{YyNTi3eZqOybn z)03L4`Clb4B^rLR=3iixidBwrocq=@p#PqG6p`PIY_lx!RSxH7SpL+5(34Z~kELSv z&ZLM}dMozU@yBv+MZN|v8%;Iz;=_j~(-CHBHbeLXn_>$`pY>HlENe6hu*qP?^_{iq zaoDNQx9T7;)x{4_z&EA8!C3JRULRFkZVBxUDNwMK-m~A#`i{sK8_s<@=g(;z#Kv9{oUn|_Cw&i2iA983{mp+1v0 zqv8mbFV5X{C*~=B3JhCGcw(?GxglkUX$pu!=M^}7lD2Y4kK)dxseO;7uqleF_w`lx zxWD_xT~=sJ2=^QZ9X!KXn&`XQ&8qv89sj#YO$;IP*E8mwTz=%z_X@I z56oZJP9mHC=y!D_Me4E3MhO_{qR06-aDgyNeoZ#Ool(5keL+^g)H(Vu9oJwsX@l5G zox8@Sq)abXPe;vzG+)r>f|;Tr98axUM*zxP+e%^?#~&K+=jr|4Qb9M>bH45;wx8C{ z9I+gY0k?DiTH#$x5WG69>(RfvYJgL_Fs$1ja)M>GO19+v-BE?wYlZA)HBmEms|4%cCe))5bLVv#X=kU;o5b?iX`1n6v_&-l^C;4yh`8Sw| zE2GN^f6o5C>`fQw|6ZW*tTN~?*`$9ybV?BTfBq_Zo^D6~{oTK}G5-JPZQ^X_zQFij z(LHFneAJ>qEBoAr3EQ<^K8`VAcV7Ff*)w#a>DbEG2%Eg-E%s0KMT=7kITf`%lCbA* z;F-0)7TEd?O}gCGkA&G<`9dZ5FIBiSqS9@8@RJ^6uD2c~T)dl>%coFQ^Vh|qM;|5R z4YXEg;ub+4V>z>$Kb9fPB&RU>z$m4>c*l`Wchggmb3r7wDm!cv|6;Rw#l;F8C^2i+ z9Tt{6J9}@+5?zlTA3@ck$62~w?)YMB`NtjosU%9@qOQgI9jfy45V_9lUuu$PF?_T! z;fD8q*hizn4J|0TIG)32TCx+oxT3^fd}_JMpwaO6%t=0_M?v~^w^4*+QPr5%?U@+5 zBljcQ_|VTEu$6P8@Cy}p=?{R{={zp=b8E3=Cpjh{g{6KuTw1^E`CNqVQ9{~0T?W%_ zyoi+%K~sbweQOaJyG4*t8$y-6WYKyp|c9qH0SrN3bw}0hsg_e6Y z&o|bjqQDmX6;Bc%YR-^MamwtdPJCzWB@E1{rG|BC`d>SWy)=4x9p$|(^fq7U*%!$nx64C#QYW1^WF8;-u*vok`AU=BszA0Ls4K7{7R5!B(k~%LH9(k z!$23;Z~Hm1nikcaQ)oXRyV^~Dw7BlLUrRKe(Si9`tiu<*pyCiQRe=C#unV@CzFF$Z zMUH@*lb0pu^O|BJ_=^lqo<>wi3o#y6&* z-lxH%4)ca#v1O2GFyhLHvqYDh~phU;^bI{{UIkV%th?bB7 ztpm{X32NmRN5nzTL4Cb6?PY1pL&!?MzfwVs;;Lz?Nc)KwN$R@Ybndg<_Caweu=umx zNqTzhfP#)~>FSd&1T|M{N8lYjG>&TLYFjQs<|LaL{X@$WBw?&3OQ>PuQIS?t^tOsY z4$s)e94^knB*6*m4+rIEzrn1vVc0L!QZyi31Vny_(t~(CS8Wrl5=Wi9fu9mpdA*bY znM!1SRq{_XN>4*E{L;*ks3%rG+TVju{3nd5t(=?yJ4#98A+rrZlzX4R2712 z1?;xnp^6U@X8?2+eh>?O32G1o#(8}I*X$JJ4a!ov6`z}1`Y3&jdyw`VDIBSbi?XLK zJ;NNCH0OgNYOt?Q;Eqz%EH{KbN6)#j4KmP&PP~irtcq0AU7Zb!!_WNX%7eMPQM*-+ zF6+W=j`ONHv89-9mxxou#f+d%gzWxVM@h#ZmedS)b`n79Tpb~E@NY@^og@Gzy6uXx zQC6En0Fq-BQaf528BDiX^7c~((?TXSg@G*^xI&eM@I16z%Px9~PZ*EVD`u9eI#ZjT zCPU`}R0jPnP1p(pV>w*Gy11Qa`tyaEM8`+^hB2-1=C7c)=~F<4fnA&^5EoWeK0GE( z_XSo5C%)xkU2ux5M_DEYj?wG6cuR9`8oes?_ky-sEDNos}=sr?^XKTNzMOMGz=t{@p7}Wa z3079+mXQwEq5Bp$PZ)W!yBcmPpBUI>8+Lq8(D%?`#)S-HmDsrKk08a!*yx$5t z%AVz;ZiY;1>(!s9&r7c}YqqD(Y?6j)08x+7%IC-XGFmSBb*J*6d2=UQ>({z=yxGKC zUI?;Twfxb_U@Av$l`2Rwo|!%wqJzvXg^oh>=qDJ^OU&=mcJ6NqWJR@11cwdsuq6*M zlBRzmMHqL424Xx6L;b?&F+|@+0cW7=89Cy{V)}PR zufqew_J-9v)xS&9cT4qG1u%A~%^B~s{~)h2V&3*oX@EZ|+^tHl{7m1q_z&}HnQWEl ztG9|QuMCy#cg`hCw60G3k}0XfT$gqdoD5j173QSR7Sp#*y*?=>J4#c^JNZAZEvB|` ze!9XZc0^4Q*B%PVJ-i$FRbN(x5gXAme@?0j12M6aoaAwbK#8@hgLL=kfwW)yj4aj4 z)`%`tWBiUi!n{9^_`<{4e|$h6Z7-N0YIl@f5VCP+p->aMmdx1vN zdp3$~78#HyC1yn%*3y;In1>IIW?quVKqLUfzr@TmEB^MvKUb;lqX>Wrv7X;mvo%xU zOVkr}1Zg^p?5%$=f){;sHgS=zCn!j5HD{p!MH4BuN;V|NS5J4>OVR2a*0)|!6+!8R zYt{7{dJWE6R=P^wF~sE6%-SysX*Gk6K9(~G-1*=2{5vbP~yv|Mu(u`(6fHW}U1i8#sp9Y#jp_4XUOmRKg$ ztEAekuE$LEkHT0z>?NRERGFehx>B%-`T#b?-u8Kauq!u&g*IS`X0C3+z7^e4WIlGTMha5z4wsdp!I+%m zf>S>Y

v?yJ*NPJPTURsHkNZWT>WI(EqW%v>Ej;QTG=co8R}S?KVyu;zVg{2t%>_ zC;m;p{No35CZlir6Eur00JQuZOmcXoFXQBDM7`LC{J-W5#DDPG zx#g!u)-v=aHMv916v!NZ6T>ezjUBvO*%F@VY+3t)-B{i^Y#lqfgJtlgmT)bWpMDrw zZofA7#QYO*N3!*9dz##i>KO;``3l=bcsnYg;Il){K%$&;PBXJI-#jVg-5)&sBAU)> zI!ZUb@uiIl^FGc9cNcxGcDx^J(di#)G5jNARsoi)dK`AB0-L-gs@k@8(WuEmH+MI7 zm%<8cZ-j+q&Bnu|OArnIk4))%%*W8^Nj`yF)T+q5izs`0(R1KE>wewa^{>!k!B_ft z44If>JRMtb{Eg(p&;VC|>fkfhW3}Ullu``CF6KiR;o9#!CKdPRk(8l?R$G{;>hY15 z;%snn{RQLvMAq?=or{gA?BC0Qfx2yTu3Uw_NmrDu@}zoq-{lMkDdkf)YFh6&BfjZ& zoD}jlYVQ@{RJh_VV;mzn?Qq94>*Yi=n&NW-@=+=yu9nwaZ2NQj`5`q&tI& zou|hnI4~aOvsmslV6(a3->3f;6>jn;` zRe*WmK^eM9F@Aa(rhWzW41HXHPn8Ob*Ah5nuj3Bg0Fn>c=Pj(7M}}L;lp%9hZ7cgd z<0VR%%fHOomkDMT6@PZ%OEDkopESUQgT;^4kFTW6CMk-XQv~@a2{gEho6uc$OS$y_ zGhhs249!RHQM2NCOq4Zfey}EYd1 zq1-2~hf?#&rg(Jqlk}CWo%FMw{KN5JZ(m31#aKPuG=_9!Md(7UY&G z>%z?*pXB_1*m}>frn)9Zjc(_Y<$1-z3;iM^M@a~!p>f^)}EPr=AL_#|DqrV1KCu6(P4@I zt#6M?0C|V{Vd(UkE@9IjBQ+BQ@EG{!P09HprAy5Ar9|mZ7kP-zd}oF$9n$*6I71pOkWPv8}JD#L1C)9XJA4j=gpQoFv@eoM>qcDI z@t$BN%%(qI`%)#bN-eQyO0-{XdYr-WKuH>y>>oNkx@nEtX=rY3dsM6+5y(c{EmHq7u@!xa#o)`-aOoUi#jAUVnp3XlOSM$BC zBsox$aA}x($B)k+kFedR1M46)@){I^cC%w>;F3RPxO304$|vloGF=1A`-&#?O2@*` zCnG2uCF~Q5F4$7!dzch%J?$qlzGswsSHG!u!im!~J008_(-ZLxZP-V;BKFT~NxD37 z_rij1QoWgdFm1=ma%XVgJq<`$uCx9lWlp~5^4gDi3L{503Z&@_MI_k$qA66l;9qR# zk`pj50OSwii+V=iW89)XCEQs#;S{Vb9#R7Fa90m%@s9As#Mp-%&78*i6K>N^E>8WH zQ9h?#0TD|Y>5Z`VZ00 zAoP;bp;(rwcxovXe~i0&rw>7uwZbS|3{^j$0i>}O-_u?`hA=Njjr@j-y!9-95Zk=E z3S#J&K<0}J7HavRtsZ6*!C_S%!SIhpinF2oDo&>}(69Uig{1=sIH)Ibf0SsLAU7u8X18LUm32UK_NV#A`qMdp2)q}0{?_pA{7ZD7F)eMu6S3UodE5N1tCT?6 z+wWDy9!fcSms8@>@_Q$a!ZX+(e%yAuL$ImfbpFW*oz0C@3pt)au?K!NSegZKRuUTx z8}ua)({PgCRzL29Q)9~0-yA?SDUr_UI}JepG34`bSF16>=qf+k6@|i!0ItDy#su^g z7SyMK0D?tdxzKHZHQ?l$4e3?xzW1ShQ_3RB!^mL1#yZn`1qK{v_FM~*M)ry|?jmhq z+*@dKjAtfXe-GfGaWPnMCW9HCO3>X|CtRssB$fI*DXzwfL*n0C!Nzi|@+i#Oo;z|P zy8=EG5{z$B+z+d5i>j}!af6j9sR1srj5MHYM{y9*|zovN# zG{aKAx{)6vGp_eJ8x%0ZLrsoJROD^T<5eQ7m^$zih$ zXMVQ3Z#`R{M$cbI^`YpNYuSO}H@w7GE-N?h6_B@J0bS3d0{W3C1r8%dAdQJO`EW#$ zZCLqv_khavVs!nmAql9M0}nZy8dkcTb^afh6bJPFmfwSwm*LyPWqkKsjBpKG8&FC}!8>tcHc9`N5A(vdL zr?|rL34eCcfw>XnF=vGc3b|)imOw5}+e1hBBYO7aurV=I?~1u4r;CFdI@RFOpm>N9 z1R#2$d#7>0i2^Dj0Oczp<3e4d%ahipjs|ThqA1b5uj^hD8$G`0N7fK{g`I?<-SV6Z>`)2 z1v`inu@7M6tq$@FVOt_hV=*A>1nAWvlx%Pe& zfw3^u`$7K<%A#dHETH*hJ}QXN@wAEkfD3_=g5^RT^4#WwGo<$3vi&R(t^zlM`&xtQ z1c8!7%T*$)p}u)Z*1_qm-%ZnK!-d8neci^w(BPiR(f#tz{-@vQdE&_nNNj4UNzH2S z1cRedY%7=z8ZE?MtD`PTv6WQ3^(jW7wJt6Y`gxu$$e# z1=qp0R`R@A47*?Bf(Ar2%PaFAZhgy8)~lemq;(f#)rI|p%PVJ{%*6RbTyK+5_Dd^0 zkjfc~JHAG)Dwkm)GR4M4Xo{cq6{jVO*jFd6BahL|IZC=${1t9kDK0AfQo>$lJEgU} zpLYA`<7y@93NAGkL#&fFp#DM5a|{d``gP4)1q9I+xbP&hN+WvL6b8b*)W1-K^LL0C zQ7JI-{Ze4(~p1AAJccI;M5+G7eCUgQL?++a;txVdsx6$_UHW^O(o%Vj5NXb z0b{ewboRn6l{v7hGNGNpKNqW!2Kf`wI#QZfXIDkNbI@+qDR&A{vnn{gAnkRah9A2F zPIEUb)UX{554ZNs_BI#?rf;1=g^)C{AsQsPg$_t!`OWkHxK@e_Y$Wz9*%Kw1QPx@& zC7xoT6Z$f+7>oKfxEGc2NJ3Cqqla%IL^}4u6Gt)pg;HG-jp+p?^Jr+RJXeMtj6q4aPFXu$SgM@o zhE-!cY4ev>Hn+YK-0cz6$*Q+r5zG8dZVj0Rt5`{9E+|L^Kl)KI@tJF2D_MWRxzfyi zCG_nlyiEV<*`hOL;F0=;6ZEZmq0@OtKV9wSz|&Da?0k0AI6{bGwuuQUoEPg093y#0 zwDkWhySYy*$XWzwcHH(`kU8g^bEy2+pWW`#nFv85)SE02OI#4*#!d|J(8&aTD{meO zNn{x^CK@CRl^A}WnoQ7rd#L_*a*B=q|MPPmMzP+%5(+_W5$;pX$vcli&fIvIK+j`W zmm;=jyz}1Bz?+@WUkRchAmG2I9h~LmQxlbA z26jcnktsKJ7xC9R19r(<4R*2q>s)+28bVF~E;lg;WR-A*3_n9sTjQm2<{WJ5)%Yj5|`f9sdI}@jT z`Yw3~o}%h9K0aFWT{26VTgn8;xF(|_ zrx*L#&$E`FITF2-ba$~A^mF)o1@qn!p)@hpcm>3ocanHJrY7#$cT$p3=^PVq~2di`19S&!n1i-MJa(FQq8)lZ)aIu z9M2jS4?IohyNI{H=44yFZRFa&ThVmnQ(BmHNxyNDWP1FtO6j2W&X=2=HH1yA%x7Zm zGlviPgEAJ_f)btyFR}HtS34%5no*_KsM)2~-X6x+^R&4CdP;mApsC0LXzLgff3kbh z9?#$j2iLSk`&99m0m|c>&E|0j;GE!zejYhKEK{sN(c)R0xb|nukfy)>uZUhGtYGq)^c9~Z0oU|&)IQl|LBuC zvut_qFR>7xtGaNf4V@O(Bh?lSiVrfq^@3tU{>n0O9Z?2bOV6%7|$>`8y7&_5X?Bp z?U`35ozFJjdbg%&Fq;D?aDg?3?TO+a{|!qY{^ln0_OQ{vVI)}L2A)wlh^H@IOq5SG zYh~-+Ze*Yq?SZ{9mDOcW5|?)+1u7k-?+}o8dFfQw`{i9Dhczs{7>*p}G*ufCdu!n2 zS{KAZi%P9He0$#sa1}Em(OrW zO_6?ECUzY)PvU;(J^g#0Su<<$MvzePFCamiEOObaucKJ=K=kR%smT5l8ZatA z(xFG4H}D?MTM_<9bduQLJ>IG?pBKGIGUQy}7>J#PKl{XC9D|2+`0nk(B2;9p6knwp zT!xF!Hs{R5D+?vibnG+vXJV~E!mlP7#$&iO9t7xpe0scJEa?>_pBvV84WdEXT4CO za?IPqUft6E;cpPt3|*yY}r{?Okc@lX84K5Y2cD zcdd!{*;a!Y&a+ln%-6n`lCWvA%zFz*{YqeO^?@%eRNGxSi=7DSqiLoNMgTC%H98cI z>}XB}WF7=qPn;Y=v~n@eC?$$>-YXlO+2hj1yp(9DrbSxD+Bgg7Ew^~Skq@v;$1G^G zLFLziGC&y~8TVR;KGruYtzZ=47GcRh!9W}054eiQ7CIfb;IC}*ZSZeAYP9$tXKlyC z7MY-EM!{E6r~(v}$BscuZ80KNq({UDKY#iomIWgmT6)}JJs)tNHNVIdX2(}b_?jhWqf^&QJ3xMmFQeP^ zyFOx!i7aKr3212?h8|s$T0>C{U6Z%KBzLw{*a$>qK(-PHO>Lzh4KnvcdRda{7OC_@vR3*#nBS`$B$k(|qWagZ=)pVOU zT%)KS1ZvBmfe-R?fK+J5WcX&k{u6;$H5slMli=I3L8iI_zo!BtG7^5AUlRoo>E;Nc zXx(}&IVkjnvUwQ5*WwNbG&vgPud`Kme{31xv$l68Du(S|+-Ve98EzZN_R&+%tPnWu za)7`ah-nw-2hFvnK*7;LD69fp`up#f%ixSwn23I?qYzIojb_Se{0}#)< zW*gs}iV2fx-K3$LhVYohj-_YU3Koy^Ovqh7(;#~JiYoy9(lvf>o7aRbjlST$N?=WW zPot$4qCu?L`G9muM%e>^CmU`-W1$aW8ldI<73o5M>{{M*MBFj8?nSpC7q=;nuUQmS z=w48VVp!v)M@rewRT?H}Zvw+Afduc}q}tXX`5?3`KCp^~;Std9y(9j{{!RU&tz>rn zF6Ssv9Rt}ib3BOXQJYnC#npZUstwCo^}{JRP5j+jIu&U^0MTO1fORpHk$3VOMDp0R*G!9Z#=5}BUktPNFVDb_lCmdv>j`);n?TD~K=d>(^?Bkd zI`5pv*|h`{pqCD_&5K?Oai7=hqMfwt;M&y+DbyCoI^fYto%vWzFXojbtE%*!9pCNG z^PAyEs(e-e}BY49^k+#{+&>W_N)?^rqShG3hn`>AI#+SiD5fam$0kq~75RWR_G;l z?{>#qc5sT}fGUPXRW1$v;xx=-aBd`Pv-6oa%ddT^5pz}l(i>$e0;)0Ai^f$qEVwPw zJUF|wjEWiK$>U2;aY;8Ncrn7vH7U=Mc-n{&p%dnawiadK%A_WveUm(?K8=8MJ2ndT zw(KAq(VfOz<-@3CiZX^jsv39j&$3JWo>Z8r;T_xu-wxQCFUe_fSSp>q5+ApSvas+B z79t7ByS6?1^`hx6Fc7|&w_$D4QeC)BZs3@P>sK&g(;vex``0&7G*|nbLpPa=8JlH; z1HeIvKuk1Rusrigd7Pb%dKs#I{^{gmlw9o1|9B4ookOA^d}FYQmyi_@x#~c@1!m4x zn;e?5l`oXmziwlkla>y!Nn;+o)_!`uD`dFLM-51FGaT8()}D|RE%;`cp>hC1pN+(v ztKzLC2I5cVRe8l8H@2U_7m{>U!vb=bSCkwkzq|%6G9YQgk!&nlA)vujd{yjb468L2 zkS~i~`)}5CQN+F|a%HejtCv`jpqs`rRewLYxNnL7x$lmQu+I{y?#}tSOPf2lCXQLo zM|mw`*mr!= z(&wB=lx}u&ir>-D3(L`Zl^LM$^izLBXwYU?x|prVb#>*(UfIQqBTROsqY^v zrR6;dArYP3eXElwp_+RYg-7*>N`mi_WgathEhy&u?=1Q3{2^KUjF2aw-QG$M1s|0( zu*Ynn(3WmRiT~Fe2R9@r6Z$|`LDNY0Br|Si<#%u*#|AS-NQ7G8m46z$YaXiW|ThlWW zCN;30=4J4CcI_X6Pp{O3Omp|MBfr6On|X~WaCh5lp)=>Rz^Zi!QJVIhcU}^4KHX}* zF}IXmGb@|MG(EW5_OD1=jBJebh%C2N^qk=SgZ;~of`b6f13<^>%6b6z&ir2^ioAcu zqDn&#GyeANK<2zmZsDYAZ9Pq24v15xqU9s)>&&KR1)y|mDQ61fyISf z4G24`|DH4N)}$TSB^_*e77*CByI?msPdHQ2QxWdVzP^#VA=;}WBT7EmLQ9yuMAozL zDu^G?K(K<$;IXIoFP_~2b|)2kCYL!)_bFrI1EO*5!k?1;y~n}FClm7X=PxF@-Hz;B z=jt5}tDjcE0s1}leLZvzj?Y2~g|{RK{a}b7Cu_M(sBVW~#@L{1CYtC1yJHEwI3P5Q zeh`LA{(N6B4pN*No)tvaAE;!5fM<}kfx%u+_3yY!4goDBsY%GB|Cd`TyjEmz8?%&~ zYMjVBL$4nxk!>}Ws)}&JXR&7UE}>axH$H5`pfDzI^`C{F;uDqR)5q&JdhjBphGXuw z*1mB`MO7Q+9iacr92~ZbS!6eF`$h%jw8x+syVGUG;=5;0r5lymY3m zmJaxq;!NzuuDhSOt@kU{*Ge~6MJ_ZpxUP}}*n@9!-HgaUG1UFJmn)c%4MIm7+hXB& zUa$GSd`}Na=HU#SR@iA2lL*r!8)U>@>M`tjZO6W3!9hGlam^g*OVO)XNi>TPun`0q5_n-Y+Kdf?{4TFXv z!*K%#YDn%A0X2-j&1*dkaj4f;h1(L+oe;{VMgX_<`w%9Fo3X3F52}CSdcWOjkCPBk zMK78{a_q(VFG7o0^t(&B68QP!`J^iD-JWXW{+rE#&Yk~mjh+R!P@8?D+dDdY#iYyi z#P^g|+Kr1CD`6rUg#Tc2L&@Xv*`%7Kt##dQJtN*UNjE#q zd^QT5XqTQk^xPDH+p=d_jEli|1k&Mj?^fv@-YDzc{&E=RqNum71;1xuc9v)%9TBdZ zX)j2_sxkaX6?Bh?;p_lMkUM2i&X=}wi_p9T%v&G|Tf&?oIpOcY*7XwD4A>(v9Vc>fT( z81M=MU5{aJCHEr1MiY2q7L6&d{w3^tCmRpTrX+j}m6-SNbRLiXIB~_;_jLcjH|U4K zr=dAddwIjb6SQATAHB;hn?F*<&I*CJiQSUCQMr>g-7a&BwJcd`hBA+)@f(qU2Ex_Ip6L%b ze#rARt1a=Dogx^vwX*b{InIUt@SI=}y0?U(iEI4X_KToO_Vf47v-KkUS{wNE3fyMR zhyx*Rj!1iip6XTL;2Y#w%b}qnB=K_&A0fK6levaFmV(9)dRgdp98)8mBYlL-F)^5= zYf4sQnxDM2B-Iub_L=C=^Q0^l8r3^g=s^dsWNK)>G%JSe__KwHt~BLe%F;XFSZbGJ zOI4MYEFv#Ia*3lAODd+KH`;vtsd_SC;DASwA$MJf z$V{CLRkUBZKkDvt^eKg@JGZdXJLA0e;2+ysQO7`la*{?ajB{|1W2w$|e8TNV2=xZ3HO>fwSP;GitsWUuGze&p^7lB}bCF%;yC!IXBcIa#B+s8K z91)HSAI7*)^|e&ECgTLB0$rqOZKTUZtXi`OC$z6a1vq2aT=KUAy@D~iecWq~<4%g@ zIJh{*UU%#bvEjym!BIf#HA>A)_jGS|Z8caK(KtvsxhLFhkZDMTl$FE=NP~T7B5D=M z-Vyc7^P&SWMAO7$lta+j0>lUc^OE&`=#+ZBwK*72RZG$fZ|SHEdP4MOSd(64$I05u z?@6`FrSkW6%41dv71$JM;dMHwbJx5F?bWTpb&tr(c*^+y1{a?H*RuQafO!1oqnXVy zw>J1F59cgtGvD`3i z2&rNLolT$*Y`VqM@m<;CjvX0J0AsO_=;fCE(zOw3U0l<^Y-4xB{gW^fQ_@S__VNM+ zbfoH2%Vk@vT&8dE(O${SBhaYdqc06Yon{eFy!DYk1$3)9GKiY!_%mRg1E^ zppsHx$-<>@cFryD*&WZp(U(bgYaU2zH`vIDQTI6Xj287`klK%7OUahXK)Jo|lGP)X z?wIv#nS~hPzuo2-p%Rv-(ydoH<6To5W^f7Nd+Q2?p*7Paf4U~Cd3QC`!Wa2~q6BQ# zz5fGvI_{3?>wdtp$0xJu_#Fql{HTHwF?*I9R=YPN7 zu=!_W!kb>!>SCU3Xp0J?*9(;q1Pi~F;>2Y53q zf#V{@NP|Uyr&*x!*I(LS9e!Y>SSFi0zS^Huv$a5fy_|WJ{?sS1I0qliYILCIYwhWQ z4dHLV2mVl$Q_J~@lA|8Vi%zFbBZGueS31g3&VD!!{_G=Xa$4vUm7~xj*Dmr+cKyk? z@5Q#k#8jMANJ!H=6%!OS*jRH;N02bWJtzx z#sh4R%5`Xk;CtVtKG+D7u4fB?p5K?&{<1HE-!qN0Jc0=Xye*|?{I*LkwXL~QWm95U z>!BRNeweZv=w^ts@6jh6TdvcTcK7gpmM~{Ucq^2fW8DQBdKCg}r@lmzC!W#}7ru5W zBp~*6Y63o{`hD@g8MtFZ^xCtUv|-J3Cz`HHCRmtv^UqWovw-Cb3rt^aaP{Urz)>$@ zeCBPM&YH%tX2U-n^Agn3KqpIn0OgQ`TyJ?W!S2YHhu$;sZmrzMUy2 zE(<05?DQAjnnvN>R|A5oH=eF|VumHhk#95`0?9RJ$h~K(SbP2kUi_`zEo*L4 zid={%7B>)XG4Yn2W}UI>d~UXR&slj>HpqR$;8Yq&&yN~XzD_niI(tF@OK%?0OPoJx z@oC+DgC7_turu=8sE`Ek_Xqla|Fl_npEfc?wrXPn{5Yb~wsGjschB+P}e}hO6~Y$ zMWg5QETWu&Ev7C#vf~Pi(T@cJ8tM<#;J9E#yZH{Q;A$O8@SZO21X7R*DLqS-j|e~a zc~lL3*uR`%!B5e_z`OB4{&C z4gQ&M%xb|ZWl&pP)ZXB4-mpCF%X0qF4@q`hHgA9x)VjV{z_3-TYis;?j2sRK;VwR=wc-y zlobAcRar0;G7ZkXJfEqZp8Q_7zK4JwYV=m_Ad84LYXh@0LQZYO8oNk=3b-^=bX{L# zbHpfah^PB<8#w4z@?5K|K%u2VTPrjrXhvZj$x8P~Y-rh_uaVi7cvGS9HZ$DSkZsJ<^+}k1Ns0egHCzYEd>;(q@q^ zOB>CHu#-1D9NoL!@&!fwRhh z7i_w-^EEZn4L@Wh)iItr4ex~~B18RQVlfoXS@{bX@y@2ia<|u!);4B`33QBqr}@=2 zZvIG>TG(;^>F_t+UBp?(|Hj#i<96}6FztUIl>hTl%-h%h-O&Gh3sxcdKQ{UMk1(FJ zDC1!v=cc8}ZzHNPccXbuxc?bs-e2q!;24zQGZ?taw#$PE#v=N!a@?y46bn)K=&I<| z{be;wD2zv$9dDhC((mEx&!mkUQ6>Ddz>c%7;Y`Zzl z^c?E@awZUqO&{WI+Vx)6rc0c(=w<4P+V3xAP6YJy?zU_<=?`vz$;gLRUt66m`vqb{ z1Wv55YnS_rW7Dsi7!Kw@>O!nCMxLac@|sN8um$hAj~|o>oEX(yI1LBk|1m|%;6oUc zm=aWwBTL(;fbZ_h49cKvYn>dJM9_1xcGpVkF*W+xKeBr2!|Nm%UN$BIoxAYc9|rhZpIU)_JBrc_nyoEfe|-SaSDK zcytD}bn3Rx1lI=e`V7OBL)bfdT$+*)K6@afU+^SKAKl{aUNLxN>EBz^tC+dbO)wHH z`kF!!lIlYmIt%MO3Yj9!LL^Tu8kRU}I;HXZ{lL>rdwl0luK1>9@Rrebo6Y(^{h)o@^kiBudu&p+yJ!S?*@MBG``<^ZbM3RuncQ`20xCX6teUE^&CsKVI8A;c z7K1e)%k$Q!+`9cegNa_CV(Kk&=u+Q3wu(hzh zIvrF%BYzwQV=F({ZcEXphUE@~(~|v4x_W~3N6|j!KY8yDI$XAyU6+5%7pNgC+^&--_E#sFybOw`#CoDS5GSakuswAla#raF6ICSa1uuzKxUtKlT0#9 zcd&D2px-~vA4CHwT)4==fXWU@3@DAXrJAw=^^DX z!t}e`S8rrApBN^A@>Z4)>k?ssAy#>-Do1cVbT^M%;H1LhDv6iVgS)Vltoghda4jd) z%~-_rHc-F_y6iC!!yb?or-rDKlccZzW{S2_a#=IzZL-N*9;$EC)cr7Fo^!sIjj&f9 zzL)t(cmIb?2uf)?^`{+i9r7Cp{6ibzM77K^VBIlKOx1GYbTiW3Rj|!h1NfALk;(+_ zf&mx?29@ls8=u{VF^1IYoP$i0Hr9~Dlf;vi+v8Y9vf!=zW%v7fs$YmcMm77wUt|!z z9H$(eff>RVDnuc?gEuE<)^S7B-CS+#YitOwQ(pbq)D5+9@rX1t?aMz7!mSr;XK!$> zJQ)ux$WfAH4Vuco@M{zxuGC$@ETxjhq9cij)9FTOM`xna?ZDsmt7~{U(%n^Gk{MSn zrC24;kHi3(c%=o%O#|F3Dkub^0%0BF-OkxKm?*6X)WuFdTv#(HDJ5eEo!4v`-*ft) z^f*WmcJj35BV-BFB7?lPLi*$wY(^Om(;+AOFeNRuH}WtPNet4PuDiW&sn)L1 z#GtB@j&{A=SJ`1UkM-smNnN+tRXG&eP1P6W7SC186xG2HNoBO*u+iAQJlW$GPmjbU zeeaR01+y?jQpH&tc#Vz$W&gUK9=wDRro9L=TH+=d=M2lbOro=w2-PMehS$=+YsVG7 zp-TA|lSJ~hh~G3HT5yXx1$TdOiQdG?EHuHb>{OJWP(i1rkM0?C06XE2Qd=Y;h{m>I zxwi?eaIiq@62Aq9u7M^;ZV0)GwunwDk~6s#u6KfH-6);lf`L_ZUF2Em z-hJZF80uHqZ#Dn03WF9|ujvhvf3PjGTH|yv`;drglgP|k;#7x*X+(g4F=@&DOy~hW z;M|2#pD72Ia#e3CAvqThFxY4hkggcL1|@o1QmGk|a^V}W!=ziCKCShFpzYftoWtr% zM7;RaH-$xlsqo-A@l&%$OTf#4q&ZyjtLN| zYd4Ki*|A61sC4{^Qp!lDrrjBKI%|(RYLPlxvf_S2_AeCUu@mdUZuKCqzIFTayC!p| zU=gn-`!n-5`5rCxnuaf|Tc!?H$Gy%%HuQs7ffL^WDG-Ri0{^D#&yXVPQRe^L!7Shp zpwFdxHhigQ<+uHE30@(sl6xHr7`)<1%~0?Z;socs904C~w|vA(3p1n)gVIWE_$P>m zdO$9X6*CPJl38-8p<{Uz5K~AmB$-t zJgAmHBdh-s9Cu`^)RK(nOwmZVW(JptOSF^ha*OQv0j3~!VWcc4K9qjew`yZ#Vg{4Q@Tr9`^+vl6Fu&kWCNw z`twjnV8qCX`e+!pGk2i=qcFP(q29=F(-v5blsg?Lj(8rp?#9T#E(n8ymub60F*?a{ z1wcFiurds~o{cz2z>6=MCIUMFQiLs8-0)TjAIMp1`@I2%Eg^@v)!!N6N!$(6+4P66 zHZCip(PAc19?9unnd!VbOg9rx;9+n*c^fS<9T+kLe^ji>F3`Cu>!@}BO zQm4*lm*K~>ptYpSqBdt*$#erxlRr;V@3)_^WNqhr#TybH4$%`a%%l zc2kYolC8{o2Bgkk@~y(QM%7sw$u@u${k;Eh-JumUG_A=J@UssJn_}(0_u{ch$)=uV%j2EjU1g<% z>-L~|7U|8ba|?n!&(_$~KwxnTpsHNvQ}RzoX*>)#Pt{qe64wnwfBN=C}Yom{S)Y(&I?z>DvHodE8^lofn%wT-9kw$c*F z-0RdM7rI@Y*uwv}z-y>_NbHC^dqlLzO_!4jAk`aRIh5$ke{eRtvlh8{l#+C7- z_eS*Ney=4^nIwHgz@?;(Yyu@@KZXYqzY+I4cupaO7ieL{A+qC8FasFK2CS!uMHy-+ z!+_|)$=B^wbp(Ly`$y5AOwfylvc8blFtvOsdY1q$O%YBIJ423c*wiJ$ynf&JucogBpkJhGf!*$eiUBQ(`e_$`k{nJL5vEE>I9ySh{rc|Il;=>P(^T&&Dgi^agjrl0cJenfLBPv3 zH|<-95sQ*T?~yRxIzA}{RuM<6@~5Xt+|IcSY{(ieq2G`f0htI6saAgtmNu9QCyt!A zWo}+!J6+R?}8y+q7v?}lou0w7{DA-!Z^l%|ueIZ)iTcDtZ8PdzwVF1ASpOW|;O=d_PC5FVJb3w7ZPu_e za3O|YxjQCDq9#F)+4ha}t-zjbU!c|3ithgRB)KKi-zFTX%m7G21iUzj)B^2)H9uIc z`Mm32o>^~r2Qw%o&61k39_^g+8=88M3A}Rk)@^pO`JX(gt>W`8mbeD<#!Go9iM zh;Z!;T>KI*42)$)zoda1_e8~59qF;wqNK^5@MLwR44mACwNi><1bAWOm9!Dz zlmALwTu@AwCUeqj)*|nhZr@_uN8j!4aEs(N6}s+rPtQP3!R~%fdfRmv4}k?Z)?*-I zAg_;cOtGp!pFA#`a=0k`;Tisl>HWWIVFH<-a5Lv46MK&dA=mRPa|b(%UzIY_d7*U@ z(^m+Gvh+FhgK=L`P+H)K=yyI?w*YRv=8V|qz+u=`CAXT_&9?x>s6K# zQ$%aga9>jmvC91j_$){wP0df0M@^>n{eY|pss z7Rn}fgr|<4JlbTFbgq7XT~fL+$Srzj*8-omY(wI;irizDb2 z<8^+*k?ixdZk1a+j@~DM)r02)`C@NmAVb^Yh%cUwbuDu3w2C+J%_(!QHZBo@l?~Y> zA%ojJ#m%m{ugU7&toFhiaN+hyjxuutwJ6!E)^69;>7)Ee&a#zZOP7W0Dal}K`(opJ zk8#o8H@cl)o$}!)2jqRfH_0_P|4sDvB6cI`B@vHv(tm#mq*qDj|2Y$ZpEJ4*|JY|8xSeCH*p0NeMF^o0;T1-sB(Apd%==WC*5dS;-PrEUdxc{HjHKTA zyjMr7W|(w4&zgSL{Is z6r+Y0cPcc0ax=ght7+fQ7Q4Gk0H2Z(&$N1wa`?d06&(4Q1sdNuj#Z1bmHiq)zL=Q| z)o!(cL1Qw<3PkTb`{-k@daFC|ot#|ooM3jdE%xg`C6Xm0Up~yksmADB)jPkftC#NH z{zs=dV#=-e#uP;b-e+i;bVJ{I-?C!ag(YYp3mi@yuwltnz`N8CHxND$Tuyh22erzpdFy{jKn_TGXZg$Ba{S$$kq%y1DVaO9cU``c z<=&*7RhckeA<`bGh$oR4QRbF;FBnhO1vxqGbA73{*|SDG%dQROjgu@IY!-wGtBVIR zMr(meWm!pjsMuCbVv!9nG}uKwQokM0jf!%kWi6>J$Kh%XH!fMNs@T~ z-e=B3;61v5~@AZyl3&!hmrsT_(cNAvh50of(7zG*jzlBx6$Me zcwWt|@>mpT@TNwtk%j8v8y(-YfxFd`qd;vjC{9)t1oEA*II9f3_Wph1N+e&(sR$m5 zOG>%oM>e?r{%7{9p!Yxo?p9Q^77<2XC$-5LJ>n$i8-)+^ntE($ytggNKJQXkv}q{% z#)Kq41KLXssa)WtgNdK_qfRRk-$oe(?mTY(XdES4H;-PZ3R**>fFt8(W-@obWp(s+s3?rref9ON@dS0>9v5-_ z@)8-tC4`4rTJ2ez3jX2JW5P*$Ne(b)h4H>;*Dru?1X(88##q7V4kH%4Kavtxn%YUT zc@`oxoWo~tPtpPwcWg8>(k+FT7DWkGCT=iYpyjzEH1x^@msHI>`=4zR-F(-(q}#Dc zkYXG28nCm=_RtGECuE;6HH)AQB+gi@YRYSmzUFYWJ~T2w!ybvzmq&ka+`qec`{sTe zjvQxNRXA}PH1130LRO1G{;Pc5(1IJ;PLHc){bECU}3=<95xV9#{#D_OTWq2!oRZxJ+aNjtTanxbAR^mxbuf=$_yq3 zUhO+z^0@UN1+nN{CK1Ti2383vZ&3&!ho}8^O`Mu)YYq9-kWU}KVu1iCEf!|jTH`fe zSkNK*Aon6pt@Gegln`T2cQi4&#l*cKJY5KcK=r#e;}e|COEg^rBz-wqT6c%YM^w(Y zpojUvsl4cFbdn=IlHV$KYCtwTDj zB7&Thqt>M^>5Wt_nNV6ZO@08f2k47z+DK4{i$PE{K>v|`*F=tIKa^zsrRX%TUczb` zeYj;z4D$GBBN*XP?rA?>Q@;O}gu z(?jw(+7Kjx*ITMxeL&}3|A%fEO!;p6w1-ox`wrd z1VA=bgg9qk-6S8x&Ai%lp6BEh#8SbZ!0AN#(^i3Y8&YboJ#tA2fO2KWbdVt+A1yd1 z&-&Lx(ee2!YBlhTvS|-H06d=e2}Opp+VrD!c{n||avtIDNYO7X0=6_XUa~0sCBHMj zXy~nI$yLD{J^g%MHc+vXbi69p9~s`5^a4(Y3G&ds6wX5DFgS7_uI9oUpm~u<{4?7< zMhFrny!S|M_dz{Au7X!Gm5mOWXi6gCKQEWll99a=HU59td+(?yns05Cq=4Wsh>|3u zpaPPSoCTFEDj-o9BumaYqvRlAh=LN7IAqB|VaQ4x@`%g;0}L`Sgdu#5@B3T#eCOOh z?*I3!<#IJW-Cb3?tE+bH{X9=KK2lnaQrwZnh47iHY4Mkh3IVQnDKW1FB}KrD;4twM zVDOtqmyi11%Ri;N0l@md!0f_Ip?8voBMiH0Z*i2i_sQZcfP~KMakOqX*4NB6VOqrb zGx+oxkrpFY2%81~PHK(wO--z=Sc$HQ2#?Nojuxq~-5%|i2-T?YFYO88U=j{lA`1tD z{PeDnd(<`Ex%hf^es0LPRJtjUSZXfb;#545lFMG4Ix&PR8s)fj%I_|@CDJ^VTn;jZ zY@@O$ax*99y?wg#l4grJh|heQ=nG*EJ{w5>RMCIVM|cIRO?u!ffBk9Ps1Q{1(E z)2q_yHuY2EiQetNYh%-VF3?J_$57xJU(o0uI}-S>)i2o zHx~f%ezl>2u6b)J;qG!w-NeIwyACWWamqU@Cfxk4&Sv&O<^r(oWc`a#TCFQKy-x8b z+Q(>DJ!a&C-=~^)pp#JWDe@R8{mbvy*I&oKqzBJuD21m#aQB7}N0Nr&WvN0sy*lif zxQNR(iIot?h`nl0zd>@6L0;=|6_ng?ZAHMN_LQqbGI5)in8*m%as859j|7pKPcYA8 zsb%@VBzNXUzKKMzK*^Hwv=Joj>Oxn@9LNHQLr>kmV99F9>K$VP$Hs%qtY}#J%24PS zr%TxegCru)B^wq4x_6%fhao`&`bL9!uIkkX!DUeCO+!Xk}sUZ^V12n35t?!exk) z%w;<(*b$am&ZdfM$$sZcTnIlyq1PUw`he|2=#eL`^IzgBx3tuD<(W`f`Nq90A}^0k z6AzQz(m5_yCpOU>lox9Z4ptW(?Q3KqmQ0*x?*e(s4?EG8GbVr31JbW&tNC|- z@GD1^TiBT_Ihmd6xIo9(DHYhi|AWz`g8$?=r45ue7SPsl3Q$02fDqH9|=$ z)>XQ@mXpDMGs}NZJ#DeMd<|5b&Z$2RZ0uZX2lE`rx{6MoQh1(^1Q`HnG@@7Ue~|xe zr@Eevn`FON6;d&v$rK-I+M@pM93d<+abil46W%hC;ha~8=e5-{Ziq}6FNQCju zZDPd!Dm_~jATh7yuYq1)o~P+4Mm~(Ry(XxE|Ko;|RC3`^WV; zUisqwMU{IukzH_*rq_Gld-w{tS+ZkKz-?ixBo`r3WDv3)d4`n#74)n9*V#2oo^F@W zjNASh&C55&22bqZm6~2kus+D3S?1;+RA5Mwue%}aeM*}M;s#v1T6^I54BnvO^}2q$ zW=cR&p<9;GMxjazU+Rc(g-cYT`n+&UPm8IBg9qE;mx)Mb^i!RZHo1Uvpk=mG4;VW% z3!8ZzX^ZE{&sNg$NOn%cGykk1Q!Kxbi$ zeT+xjFRbZUFF~k=PE{N9cOoF8pVMp4A3!s%{z1Cotnwdx#m{iC$Sz5vk9{y{p6#9y z;e=;{hUj0AmKQkE6D7po^>D1==n2iBGL8ei0pXTtZ-}$>OLh^$A?2BcJF1#G=%Gu_ z^WMrsWE4B4oIDl*p}ynUazmtmvJj!#SJyA^NL(}IDuC$~qlG`xI#-8PWszqJKF0gw z9N6V8+fDKH{xCsafA9J@dF8?tJ-qv4*peadVY>_>aR_^}VGySO*%}Z%r(IJJN01K< ze$x$|_dge3nnt2lPfCu?rbTGNI$p7Pa9`7vPYCD8B<$m+ZmOnNy&jLb?U8r<0pic& zpY1zEi^c}vR<1N*ec#aeIHzx#x$7n40<)W4+=FrMr+pCeZ0CF&-rfCN2zwtRRx8a-exbg@qTq?A`fT7Px(RM(@ZLDmc4QxJ zW*(?68B72`R8N&2C^ymtTi|iK>5CZSts$HFiVrhW1O*Sv#i>JuF{<@hj$7ag$ zK{0Fqa4R1gdxJQv>+7Tq5Ha?J2c>MjnuKxUcytsSi28^rD1#ffS7t;I9YdmwU=Y}@ z(CGRvDlTn^nJnVuU?mj&)VH!~uszzPdhBH8SEhGO!Wbze?qu{t_5z4lsI=1;{__1L zcu>Zs7fylL*y9E5Rn=~#TI5-_bC80Y`F)`doo4gM%qx#p5)~UFt{=#*U9zt(8@>zx z!XrbNz3sn~2I&loaj=Y~aYoNNl8hws$g~-%rIOm$0cjlO&5+LY*OwJ5BoGu2UaM)H z4g!f64s9|qV#V@s6G=su4=3g4!w|ILc{xW!>da*h)txyr05Cpw3mJDky zoHReoA|bn*kjle(@scx<1&6qjgqIvorH&xd&NBC)@e$#O@NENOR^6Tv_gnTQ#K?C) zNERV%wrCc6c_JSqux8wv7gd$Lu)cP#_9&^MQ0u77n6|;O7pZAm4@PKnw4ID`t|d zb^8;IdAo88^LMls1y5Rx2;5>B(oW;NReXg+ceW30cjU3H&Jhse%aPTg@Mu5SVL$M3 zWH`<1!P2`Wrm}N9Ass!^cG!tCZtWybqKkMKDzyeYmdv4aKGooZ4DmV1LG#SGGF zIVe|kTS(v;T-GNa1Sl|zV!_F0^i2|B`>CgW(&!6K74SH9u>7IQCy8-)I!f|d_HCQ~5)AtYm{d?r6)7O_M zG!%9LQnh@CTpiB;YOJ-=9SZ_lDlva9OYXzAW3qhI=^loT1dCEegxmTF-IQFE=2nY& z)1PDrIgMlh?o`&~n%Jez_U_!X!et2ACA1iNYei+wJ(2_yk6y~7k>#b&QUJt)sg1pz zH zpw1dsMw#*+$a?#n?Xdjv?R9_r@8w!+iC-D7Ujy~3CMVr}@~CSxS|}%0=$cCEvEYTf zqN<{2U!8dcs3{!MKjHBtlQnfybag1oO&Ml7m=(zQ7@xC27(uDr*XE7!-qRA%qji?U zY3u);)KzqCAeNtjl*i>rX@}gFAs%GNkB9vQOf(O8Wm=Er*W$vphNGXd#(ECr{{a?v zYAJ!uIe(2+rqYxDc3rJIH1mp2-qq}=XjU;?ic`jB`)U|*tqF`fwM`I#fSWljbazFe zR5kBSLM$_cdjG&0+#v%S`2Q_2_QD$X8D_>aD3Gi zRbIyC0E8-KD{FX39>Ip{67KNVfbD;=9@X)x@bp5OCBl9MhlXpHP5FDks8ZV$5Dre&Bp2$u!ZyCuz5>L-9*9U&eVF?Y z>9|8m;IwoP*erV!hYCG_qozsA{={51FP)-1v%7z%Qz0c#6rc{yy=Iz|8aSCf)`9gI zV6M%djhSc`-0fOGY1xKY9D2j=>v&cB`fh7Bl>5>}E@hI#Cw926ok$^SP|BBxCC6!= z;D19Dgcx}8APZVIz*l!D(Qn;%wjmvl@YVKGg!8;+i=ChL3o8J>jWN;1rSNFq#Sk_* z1k4CDXbRVG0v7nOgtvtcA&+YBR0pEN6Rk@rzz%_TW1i-urlxG!2DP#rh+KH{F_b;| z9A?W@!{^ua^o1?WWVm`gxR8pP4`j*Jhw@`(*%_b;g1&Fw{ZUe6EL55qb^KF5l}fCH z>lk)857#KCn|R8nH5{OrMIrWhotIIasQ(f-$ z;*iczA0i;`1Se*?n$uDGa5Vc~l4lY&)e1R7tIpy486YP?W;q`H=GXLk4Ge%G%kqkA z9HJi>(r&z3X@sA>2<+SCe^=o6xl>AMk3w?l9D1M?W{bP+9!zpJUwABUnOl4X!m*Pr z@Wn17qAj&TSL^TKL|ITAroi}-J80>Ih9&r8>4$=aBCzNsN^(n{E?hGD)~H5awyc70 z!mjLU^F6;Fse6&6ahOJ`V8MmT#2OmQ2Yu z2_UG3^l?%Q4{o>Ah>$=~w70lf0_u*hf+^7G%FUG!eljSLiOr%IGI)xz8la2-+$&W# zTX{wNB_L()&z>ZloN+hW|J96%$D#c`E9NRAd|%Ls48Rf!9CXca|-fgRo-8p^Mgb$%GFQgNU8{{&(_M# z0zLsGIrPT?fW{wIAawTf01UEKR;-Q8EvJJ->uDsofQe-Re@xd{N0j9FmLelZHEM=| zmnDD;Q8sz7EQ@O=3vuv|c0?Fr84#*@$yA;_CguZv8jPSWxu8C6i@0U*#fIlRsAtsg92X`c?Bmk*aBUK4UY&s z|Is=euw#ZHt9NeP?hIIkbUeXtLNL&hxx8L_vBqPA>L$qJjY@ok*L7?iuYMuWYcUA_R!hjuvPEj-!Feu%qm@>`!{>Pr+d1)oF zvTg5d^18S?Aw-H-54ndL9}{VLb+e*Q8Trdxh$u8YEPX!sGSTLND${`|kYYcFLzwmr z9c?^jBb6h&F8G)^CXS3Ol|9Rb6>uE2xNK9DmbIPE42B$V+}semi10t1f!)5p=ABa< z9dK41TnHuRnWqz}N;VZA{NNonTlpNMJ`f^af$rex{1%h)ZAijbVh5|Wo1Jlc>N7+3?C|LaxVAXao91$Y@3Iq1*t}oKO3iDq#zdaEx-q|<6uA2 z!aTtgh3=}rHg)o+9I~>Apvi#cmyni|bf@3qcml66cXsh8QP@75NH~5s8m!#h??GYy z+oc_xXQ-y80xG1QQ<79nViY#VK13O@0`>3BQEdx|Z4Qed6`os`8pip3u)7XKEG}jR zxT7tzfys~@@WCLcN1gLlWJi4~aRF$lQJmS|sXOMIvj5mC+@JUB;=X#Y2iYa))w>bO zL$84wnn^}I^RTjLhAf2x-!q!*4w{Qc0?S66BF?pFy>pEQ-k!Pn9Oc9knHdXAHSRa^ z2(stCG&}OkZJmVBgMY5hw4%N(BaOIucV=fm>Jmus`h#=+sKAKdvo_$f}-Z=GKBTC_BpjgXwU7MDRR)9sxuIAtF zwE!H8UVvjN=E%HgaBEW^ijx$ow&d@ZF1BcaeeL{9_Lou)>Qr ztCU<#W{9oA`hXmf+J_c}LHF=jpdU?~81z={2K359*UnDMu1Me|l?eWntzjz_Bo z9IVg9(yTK~@dr~qRj}7{ut5R!(SyPd&qgPq=GMav*6Btlvx^*Xuch6B>p{MC?a&1X z{>L4A`S<_e;s49wMS~~G+Eo8#Yl*#s>io~c3oCls3+nyL35YL5>Hp6|pg`5!S&e7^ zkWL;Jl^*@i!wdIMuLOVXf7auFtMvaE?0>EFZ(s5d*@Pu7&iD;|Pju=?!d{Tx@3-&H zPSG1XPy4ONRDG_Hv28hPz&0ENPF=#53h-^MkNTNmj7I&HoX!mKs7vhj&gY*o3QKus z{oOLN<5eApZCmTc?YLO{9^59+oUm08hclCY`QLgyxwZM>ne!|*J|jQK%n9LvXeVd+ z8KjQF$@}-|80R1F(%54p;Qt|(TzoyeJyZTeY85vOOv*ejR1u{2)-oFDP25!ot0x1s zONPeYV_w7Y<_=gwe@6Zb;}^5D7`fv5A65^(46_ev`!5*J%?0-n6n^Bj@2)&(EHRfj zuisj~+0kAqciPI(UWcDkVj>LY@-wRsm;6f;WqvqZ5-vh!gKs?VN7h@9Ti<+h9D$b+ zDHbI3qhHdPtSVJBn(zOa{i-E0;{Fc<=AVkDo4#qD8fj6m@;KH|XfN6SBN$NPiSZ78 z4%AU%65v(^yE%K7cR1<*I3f+D|GPX`)*Y4r)E9+~f%_b>i6T$_B8^a&>oe9=<1CK8 zjs~}!as_RSA2^YZJqRYC#gWSnv1e6sr$^Y+uX3mD7-y%Fli=lX^KvI#)Lih-uSf@l zH^JwCBFT&y_U9Lc@#CL~K?cT+8LQo0Ti&N-p5Htl)HimWxGexYC+pP*46T^ciUSiI zVu~U7aISI{NQG4kKwVo`<=EM5{dftOU6?Jic)oUH;C6zH@dagineT?v@i0$A!xVo&zm0wgbAW>M(MAlsR52urH_l0%FrWwdDb)_wK^>h zOlViPbKOj#R67X}xkZ)oY(aFpAGVLaVKHr6ROHuy^*ncL$-tK7d~j>{Q63ffdZF|34v+F!S2?FHqx-)|{^0{|U_lOL~+*HU$yosJ^}zD@=Il9=ZjfELyH`A){X(KVRao z;EO4rKAx#qVm8GlaVI#GDB}o(`GcS$Ooc~2p#j@sh;5E$`H`o?L5HI-nF`#Csvs{v z>1@a}VjK4`e14YWlxmu{iTxHJELwKtp-bQ0rQIUehRJ*RI?HKWQ=2gamapIEY zm#J-k?1Qgn^Xz0>t1@v1^W^fL#{N>4(g!$cjI%DEN~?^y$Hd_5xUc>tC?{$zU`OQz z`zh<_w z@l+IoVaD?ipoXgEKlr~=D0#uXe@f!s&>=(i*_x+xF(-}Yh(#2Wb5ql)Is{O!sCfUO z?hqILqh>r>f$$#R{!8b@;?c9aEae?t-z(IwR*262;mkkrwOpEL7jkaxCT|KaJ1TO6iDdt%C%gB3<{@4eDW@vu(l%d<3_iK#X z5q=ndg`a-?8xD2Bqk+AC5OMG!$Ng#$E-!F7c6QEx%Xmp&rg&Xsz$(wwMH7a-O?U`n ze}?-uRDv2;GNK@7{6#E&Jh7|Z8MG$M8($gg(pi6Atu71Oe^uJJL--0U(L>GlG^9El7MVj@SkZNc!ei>nYm z4woe2P}|XlHQCJl{vZpvdt{O?&SY}jMlLxiu_kxOvWM4g$o^ep3s0C8(ux1Pjo4l! zu!~7o0W^aAB-ysim&#$gPEp>hxzR42Uq8M?5fCd`qjsa_CR?f$E z--bvSwW`c^bif&1UXv$58sMS(huJ+%!0l5|^n(g}CNdeimBb7vxSJHO$Jj-XHXMC+ zUz(_2bS?3DO?P)hm_{$rh8*!&EY;#jo;=Ru^v&0Jk=q27{iCW4{V5KUCv+eOqi&UN8?Jxc2j9peOMiZ^Iw&|;SB?g7v8BM z^~DU}^f)5iOq^;0`aAdq81zUBm8jXiNM+`y?0@r5NsNNRCt-z%@MP|Uimh0&wJq;O zCy%c~k{rN$XABw`pCwU3B+%Wgav5_t|AQ`b^p@7M%0^oJ07lhZP%L53Xf*hh6H}Hn zKNZrVmRGKMA+DrVr=|_>ALPFPJ_2ws?W#f~1q6j<;vxXKs_PD|Ql}@z4I)j8%C3)6 zLGPxWCTlk8N#NqbS5g+8GVmSA~WHRR)6Z3YE8i?=eoyY4jp2 zJYx>?Jj+|yh4*aj7g3T86Cum|5yjzvDQC}En6*+5G|$!mDcc2b3adYicsu4_<6{Eqq|GP z9KSIuafqv1(AHtiy&p+>w&$3gjt=A-TimO|-DftnY3)|hMAO?XTMKNdR^0nydmotX zq8|?5DjQlAfnKfQ2hvG&e)Xx}ANh_V;Juzu)0c9<;6i zN+y(k6sm@5y83T|zs9>+%U?d0Y`TX~cz0y^`YCR`01#hUG57RxVhRkZ-VE~&E|80N z38BcV>hYHhcHW1zmMtL|+H2(mVf6h|&DhiJ>B*%rp9oUC|85q#aj6JGRKxNo_dt3{ zMY7ZEbhfcB|B@ctb|4gtI?d6Hd`|d_1+Xjy{3d%~4&VE%wSJrBH>+Ue`)=Y~p>#D4 z;VGB6SUOPm9l_u!+s4Ng_vT`unE*S&gYo&BgO8$SOJ-^<<8BDmIHwn9H-;}hDxtGZ z{q%5NQ)d10pWN+-=Wx;(+wI zDQ?t1K|XYMD;`jIGU{rrXqYP#;7DrjtWiw|2Xg$LyQT6L6};!FOh-P&iK(8p-fOO-wpsTB9|v5fJmGzZTXq*%TL2|m!X z3NB)!*ZSMWZEJfGyC~iJMeA2rsgWViAoh}0YZLPn6cnf;!9*nr01@Rurfw5A0d-4E z4;}TmX}#@LYs)B}4;oOGEJ#JVF=+)iO&7Y40UY_uHx(*#D-0xF6?CLzP<-wC70O6bCGuyo7 z@hB{uYGZ$~pS3 zF^gk-m%QHCitaHbSj4=L_K4GqE$+Uf-uLZqQ`P@%s**pudPnQ2vGhys$kc+E>m%Z9 z?XP9HLp{F(Z_!FNw?QwwH@}Ec8)k5IOEvy9GW;R;#k3sv>IRcU@=zwL*Mij^TLHKD z004`k?;;9hb?Fe2zETKmXX9(U4z!l>XS^NstXEo0-)9DzXixskwMx5D@B2Kw&~+l< z??6B6ANJ?R!DSV?QR5a2beoq2+&_LKIEQTeF83BZvbQ1;{B#sWG~dFH71wmR%Ob=@ za_}IG6)UoMDm=8qlepBn5Grisc@W$=umy_3ZE+3Z*d8(>d=G5u2Gwlh%X_c`fiyp7 zZ2oJzi_*i+E-?Y^BrgPWb@g=A}!T0uP?MVa$W7n$CkGc?<1-JuKn15szYw@gnLu1C=WxB38J){*b{REFrw%T-PU?;gOep^$Wjy&qqJ3A6q zY2)9cH*tsVhUs=97ale}5OEk^7n_x358ja4@*12cozGX#73&Dh1sra#AFI=K*#a&d zV>!_uCI>#SAYM$!qSUVP%*ETm7@Kt5eh^!wT|#kR`6-d@pJJnGr1H)J?w3|JKkdPY z{sO4AT)+50n$S&N48Ve|g-(_1&Gy|WUH&CsF@a^%7jHVUV*S;OkUn5WqYU(AZlN>+ zYu#U&0N_I@)%SnfoSOS@QKJ8s3jA+q=>LZSH2)!UD9i}L zSK0J4UrQ|AI>UII=AX!;+o0gNI$gXqAi#H$y1(Hi1e%LRqrpz1%@bDtk18j>l+I)PqFeFB zud4cn5`?n}NFM-lih2E^_=@t7X1b17$G^YQF+Rc91IL9mKuGs*Dax9e4~)FcFY(zd z{Y^MaN{Bc&ToCLigo|cwA?y)*K|D1V!graVyiCXQf1cCp;r<%71(2a)jNU@;T~ro$ z7X4rse1y;D3MYyTi2vGxnDhQ3{Pe%Sgli8KvnM%zb9{Mmsc0;B%CI1C9dMT>kMZjz z_RP(rf8Z3aB@Q027yiA?l7l~cGZhsEXsZ7%EerdnvhHbvsaqx^*Og;PfT*`vtW&C! z9`G0NR7M&Zo;iW+ARoNxMH#i>a>GkUe;XJZIYP~ztEB*My`b9uYuj!K4SeNATlhKc zKXa(M>)E`$_;Kx*Jbb%U9@qj6mgK^rDk`-krkrw7^fkfy?e;5CFzm%unJu!Vf8SC} zbJsoii_u~l?S>Z;0)HP3{YLtE^lA$0dqKk|FP=0g@8^z)c#ozJ-~QEl|Egf^pXdE@ z;)wCVEINoa&as1#PW8Qu64h~4Mgq{#NQWD2be&cZyg$h`*KR}riX;pdRJYVoI2?2r zuvagUp8h3H=&)$&FrUpkrXCWE*j=$;vd5=Xx04R}^cmsle_@^2ofT19Fe~*N8@Y#$I11k#5dJX){ zMk8H(rv&2vUX(YKHuWz_^TJp}6EWKk_YLL0K=bacnH*QZw;$v6aX}y3^#rUQIgp$H z8~;2Y4e?AdP2F6C1Zy;mNvl}0=WG^2uo6{AZF~r-&(_Y5@uQFqEw2<$8y1t-W|$n~ zw9waIGP2JTpsz@T2Ph_B+$`1Xs8uF%>d8Pem^m-O9hP{@|5<(9>^>$XH*fO+3XL4H zHxf+O^0Ei;e%M6*=!|EBm5;o9_E}5!gG?n(qaf*tLk7o-n^aEcmCeW(v^k#)W$hpZaUU^0V@(DtSkOF9=FG zHUpFle65>n5zia}X>V5! zJz%**4#SD1Q|Otso=^_f8e*ke(YxSV+`p-xj~`NWg{BdPxhs7xG3E}Dj3L$Yx|;JAUeQpxY}uiv&;3+T?MzC>kxHDCaKP1UR98VNwhZM zJ^^)bh>UR_OQ4kGw30693pvhsfZ3{zcXTJ_iip-43MYVx!%QSCZkh{qpmwmjFyrBG ze3IWzSX|?|6P9TbZq~$DeKU_Cn^)&X^+gfwDPl|>_}cs(Cu}+XpdV5EaFn-Saj2Cu z1+$<5fCc^06VeC^yd6R;cQM{TvS9NEEqfQZ>vy+3Gm^-agCpj)E*BUOb^~;@$UKtN z#|RCMiz_K|ACI*=)xl~+SFHhD5NJk_n2UAw5(O(+j0}_<4pI26bm*PJnxuu6{V8p9 zgDgEe3+v5|TZX>$jG!Xfbr>e?^M&WxnhjDrK*?8#S<$qu<8?d{;jOLajoTyK2tfqx z;b+Lq*TC;mLrXRkqMXb`OYrWQcIf@c@RDas#OL?>+X2OwEH2_-6vQ$~M@p!NAaN{! zYdlOa55Jkhny-cCVBv_FH)1NhDxbg{9H1>hX-S=9W44_Fz&%ZtCAkJkz@(Xd`w1=~ zWAxO#gtjAb29Bxab6T9+YIT)UtnbCRYRh;8C>yA{!_W6dRZA&{FNIfAjF3~-PjAK& z-DQORp&}=f``E!PZ}G8o>6rik32%^&R!r-p0_dHkH+W`ly_cowrYMKo7N9a;dWK2Z zkAKJqit7vrYXsDf@d}P?E~up3PLZ(ldES<+k{=yEP7cU|9vBRAG1|y0DG+#O&NF?3 zZ{kwhnqTs&rq6sA0OYT`9|3rl5#qDulSLy&#HHIkFBm{(nOfZa;34jfYG)A%3fLd< zv~HzCDnP1W7XMUIvf~(JqQgQu77&hmucDs}EXAA8?MOllC#WJw6C9dERC7eQ!*M#} zObtg=n3&sA7fKbB!^UplfjbPpXn_(TiuA;Vj|A0*vc-P8EL{Cdodm1{F1##0b6&ko z^qzS~eU!#k(iL+DK?`%TTk>}qaNpBp$ArZsy`%H+>pz*xY=djNe{(* z;!PR*w*X)C)_NLkxsTZ;AFBk8Wx;i?Z5O&ZAMa8&&^ zv6jfpQsv|xgo-@veKFX>kdkcLL?4K`5it>qo%(n-7jOZF^O)JDoD@^;c{sIhP;?#C zbeZ-QzP#9X@nq}-A!-SDQb1bY2yVAZ&8b?%xf}!C$f9tn^caSeo?21s|8JRlxe8@AuiQ;;8N zE79r8Df&wenA$9*N!~2`tPAbD#_cAJjQ&v=^#kBtW&yLUIhzc@EMccf1SDyyiOhAHx8XOV(!7>QwFd@3%8sH&a z>H$ABi2({?o^{Y!v%%p4VJJ}gKjwG~033Qvez~HEY0D;?T`s1PB(g4=`t6~s$E*bs zSs-G{-KZhb+xEEi7(tsn&qkRNsmpUJ{FHjpo>6{=5Gq|Y`^h5ea-CtJ9Vtk=vwbk7 zqWc}b^$lO9JKhNX2qLfY%{1HYPQOlJ4l^>AgL??E;XAu_1&%#aQg5Jay%JI7GIH{Y zhO=GtOU~q3lq#q`wW`zPGO#XvLr*hl0y*i(rH=5Q=4pb^Fx|7mtNBNN^zk1y2(u_O zYat88vlyA%uBROBzvn0M z#u_JycO#2DF}aA50wd*XFm1@Y#XX>J7{2V^EV#_}!SW7b@bfyDwfIWd-okM0<$mT1 z-0MZxX-`TwojQ37ng9it=iCK!evB?1uxe@BGSi+aY2MDxj=k&wx~p2``}GLS*0!vy z@ZIDSX1dabi{-szIKaXwQ2&We7aca2)YJZ%{pjG6=}U!2EtZFA2T<>Rm7Bs(k>7u{ zqqgeT<|CJKA+b7h*OX&!D*;$tY% zj=gxcezwD47OJiGdu9m>4QsY8-fYZ{inVp@$CX@Ij@4o)wWXrMBk@8N7U@*Zg}2(; zYlqg!@dCO4S|f$yA2Dv_p*w3H;cR9!{HXCi6jlV398_Ck{4UGboD<^aT_&oN;%Ttx z4inBS0B80B?x-l+Bm$tY^0@>)n_nhX88tEeyL#v?j|u`YUa4L>Ck1R~skz3K`a!%D z8EjwUI5vht>3@^8M{*AW9v+8Jkk>xz8ho!0HO#aIe;<-yn`UY1FChp0fw~|jA z0@T^80chjw3Bc;U!OuJQVKZ4G6No^%x=0nnN{qe-Y)u005=E^X)H!WzK8!kw4Z{H-7%66zkDm zdudg@Q*hFFlq5jB5?C11Q|dF+vcR1S$~vYQemb_t5#_jkX%(W0 z2vSz8etUFe8HgXfI^+QnK2h-ZCUMeu@%|`i>gK-3Go$^_<*>5PC8)<_*Pls$Afw&= zdF2t`&nu`)WFMV36+<Z6QKJj$adAsB!W^094dbF%-AQogk z)Ezn$QuX~LX#Sv=Uueywm5QDP7p?l5artgCj`DhMn4r4!OyU0d^|_*RmaX7~{VO+- z%kIhF_nTVpCzwcvykZvN%47AFC1mW>{Kh9|U61a$70c$*9kp`gn2aPMd5hi9_j>5m zu~pY*2UQE65*uFE>B#%j>l!(^+mXSiL-JBmQX0aV|D-$g?P9Q7vs);9TA{`S;TS;# z@YO-Gf}k59{z0+hh2L+!tg4Kod3(K{+XA&Frgmrw{%uT2K`pLPO}?NT)nO`d?o{6@ zw;NdI^KfXy{mj-(5TelV>amfq@n z7xv*cp$xdx3!BO0S2ejC+3&bO`epT9PlJQ{nF9HY$x9N)Y#=*$$3I=5A}2AfJh<$5 z<@|07XY0On#~0Pv8F4W~9+&sn56PZSs=)6z*Uv+<{nzt6+l_?Gvh`mNsLvDpm* zk)&StZ!QU66%X2}3u4`P9sFAt^*MUi&LmdkO0`t_n%l$i$y)yvGx>Y4@~{AKy|?pd zPW#{#%Au5R`fQ`$;qXiUtEF?l*~E@+=fg+8T^-h3>qS*+>uLI|R=(J@he+THB?AUd zM7fs68Yn%hDk#66{ixdxOzxVr?;-iN-FE!6Aw1cM+Uh%T@Sr?_8yo7yJ=YRtn8-w) zKoI@1x{%yubkMLZvwiL8-J?FASCE^A=<+Y;<_UXMn*`LP+xNQRwh6Tb=V`8RV>#u5 zJv3MCuw3oz?9sS{Q=jmH|AFS|&3mA;LiE6qY&rnSVLE1Ma)w5^>BG@Pm6Tmc2dOADHZr1$I#2^UPh&F`aMaX_(6w z5j?o!#W|N+J8brA2X<9T!{*^^6bm&dUeiE=*R{(gEEML>h$wjQZb)+GorjmCWX_*# zb+pU|fhaEI(AO&%889aO=>DHq@I(}qK^ze)<$B~q!$J{nSOh?wV$$=%n9e3M^$yQ(_YqjccL&o zOv@AFayZj}{;IxJh@s}8xJs%G3fh;eh)8uU$8ei&%}npSEmkb7== zs%|?r`QZ3+GN{&plj2Sb2Eo3**}~SRzlhCiZ!xcm9U?M_p8bpok@nf!Ga`69oGXzHqdHJEbSW%KIfie z4yD@ugdxMbI21r|b4dZ~G;dn{o4B)TjUa$jndH5MwSTUx?TLVLaEcef4_D zgNOG5xCxdmE(PPus_b!&dt=FW&l9zm#P74FvXh*FYy#}4qxE4C4&eM`*rG|YL7M~VxQy8+B672Gn@dSXn?=D)>tQJzx+pI_;_WaO4;zp$~* z_go|LxDE_C8hfn3|1DQbA}i`lDkZk-Vq5RHHQ9F2TiH+^=iFE~c_#EuA4y;oZ*%;Z zYCq>zO49dOtZA9T&E<;MOomn7NB_Gl6e%|xJ@CRHRp}yb7IVUq_|F<9WO)T0$b_Ac z+-6d3^}ZQ z-O{+rhrv#fnJZ@T(^c)J4EZWAh+T^~SW2Bi;^xr;9U$eN$nnCqv;Cd6lQ`P_gSSW- zq#`TBMrIQ6L1UiA7SN7EllJQq(GFdJ+5+Z!STnE_@wCm=xp~r&V}MLVQyS?0yHDEJ z{NtUcwy_(ah;SL2x%BSP-FRDu{h?FWAq35A0{%&e3szzwRpZt(_5tL#OOwyTC*XjX z+*UWSnB-M+D?YYs>pl*TB8XT&k$XGLR;K<53FH*9j{5nMwShd};zKRZlQr83>Ox8oaUqYk+cuLq6vFEX!P<5 z`E<7Yij7smPZ@S&H>1l5%TmdAm$=d2{E*9={c+K&u9CW!)Z@8L3m>}HJ}Sv=~Bq|DEh#OX_BP$i6nD;#cp1@0?z(wEM-IvzL&HpZ)c^ySn-_)7K9wl|cSy-!xG(UxQke7L`nXLXmwwlNZKvcH!gS*I1mF-1i_ zf;~K8B^kaXOl~N32veDQb*?81v@)QXc5dLDC6aWy-09}e<-Asj1joXD)ODqmrFSa& zo>xGpOoQ)9m+1WoQN#+jjK^VjWh_`LU-2bu2VV@i#dc~R>lE(aXY&tR*{1s&JJXMLU?$s5R+0+64-%2A63mr% z-yO-9UvcT~T6@-`n?&gILN4dIuKf-gC~L*cJeEvq*5y*t+mF_hbP3Rw)l-wIr#Xn4 zf%uS+$-QMRT2c~9`YBkS>%vjraR;e*+)5~wjiB7o%fVr|=9cGH#t6;74S`yXM*5>= zdxiV+Bx$?5X3o@+ok1FXrdd;z-L)S zG-stnTXN@ogYc$D8U)EzA81^|WhEzwmL#s&%AvnLZRofy;&=9Wo$%ud`GA4m{y8(# zq$evdo~Zp6f8mieewg0>c4Mk0)n!?}l@GOhbAUf(E5~K{`!a)bTx;L^Yv-_j&9rFk zrTQBOgY_XzJM)>*7LM0I^EV0YGOu}GEM!*|xC|}bY_AG05N&wx34Q`GW&k1>a>>75 zeOS0xBt3r>*%|nIhVFw4wtuCP4n!TkT-t&jJF8<_O=)(@e#P2k4*4zx;$`XQxJ$bu zoy>IUhGhCf{a+~!gt~Rn`Gc>G7e1i-TXfFqyJEZhw2lK)^G*;s$8RW#-8WsYxG%>5 zyIKqo-(s}Lb?Nl|wLG0t!dckk)v7z>F5MB#NrLaIkK=gw z9O&laE^0RzV%BxaGY8tSjqL3b2!lz?{;E3cf4kVkr z&#|`Q-9DxPw7*mvhnUN|$;bxeF@)8E7fG~q<8H4@amNFgK+ru{JY%=)JI20uFqb&f z3~A#lcBYHHU9WHi6$O2 zf0{MRm1W6AGold|a{WMfCL?B0s=+crtY`m3@nd)!olWlv^F#i@h>Ay{HbxIT7YEjK z71|lvWzMdlTb>M&s2~rQWCILJjB89k(ydnpZb+v8VNPK1Y`NiU*7IiKh(gc8d}wQE z>2r8RKdiikcjNhKxA??4|J?bZY+2D~UJR=NeV;a{*?)V{pt!%^TMb0aoS)@6s#_08 zm+)~Xep;}@YRs=(E3R|y&c_FFVIKp%?&4ogN*bxlW8_2OQHrHax!Wva8T>;UJKr90 zEsD#8#nsL1!Ars>90j9H*hcwS^p&IwU z3*MZYzo-r`EB5~|^_5{!?q9TsC?SrdgagQ+bb~lFD5Xd%NOuXy42`rj3pA~>pZkSR!sGj^z4qE`?Hx@iblyWmeIC^xM!&kM`u8pW`vq%e<^#}u5{i<&{rdZZTguw~DiD^%<|MfvtV2+;^LdSu{r!lCGat17 zXh8V`^Co;fB!8uq#H%T0FLe~A8foiga_JLyvWDt-I2bK~bQ0BWI_-YXRC4RQV$2B(6<%(w z1y5`l))<*<;RjC1HWhakID&QP8nd&k5KXEpTW*coomG;SdeUa`?|SxIjJQ_bZ(fR` zFcq~@LXk@m1*<8j)gx*5Z{yL4WydV2iM&}OuD2fYF%#dR#RyW@+t@dvZ<4jJvUFK$ zpIl>{n?%;W8cKudnBm+rMooGj$vp2jw@9whS;%WO{?oPP$n6+bH@9uf(^9YJ%W{Zo z(dpYB3=(x!6b_19b;iQ}_M6tCc#}dEPW^O1Ujj zjsBw3U!l|AWS&~*7yk8P>Ifr0eb{d&%~GzqSp;I)9u~?Ovd5;;S5tfAxd}|RAgj0| zzh0e#*_`)P9-UZ6 zN3A&Mv?$vL$Nz`9D(YxZ*ZWtc0~H9mFsO0FKFh0_xKHvZg^=*z9ChD76q;8O<|6EB z)$BJ^d2O;H{(6gW^|pXoX&d^jR7q9Y+@*9%4XqGpFv?re?w4n&^x<{>NlX+Sq1ZdK zJG<1nA)lg%6;nobp1sD29O@0IEY=$BwbM-y`?D`A-kTrkzoj$&RLr&~&soB?%A39X z#xOzkEBlhS5jNA;W!<;5TW8QeQ5H`i`p&GDtKVEN6E8$zES%}}Qj|_HG+MAx>X25< z27}(eytNSFvRQ&*Vp*8@B)cM6_6t7BXWE-VdeX6;Qe>s6J%PO55@<0$^F?6}GD#h} zeDsZU{@YWLWOdp5$TcT4zw`G=h&^i~ry!9_=7v|>!^?r=uh*M?1a9#qjpjMAD%BwT zHZPVP387!k&GVFWE*xHj_HQv{?;q8@wD2d|esp;y3pBdrCeY#x>tLiT6tkQ6*6k@G@g|j4>>*~&lOV&~S zvtJrzRG^Py_9JLG|2+nUJb(ITI^BC9w)BAM=OlzTq2>+GxUCLGnd4!wso9eZ!nx?r z*no#(><`ElQ(wOJpZpe2*ggAdiyMGkffZZtyV$`nt1X|5WX+T&Co-YZS5SUzTVT#m zS3vL7D`hBc+{}@7|48|A)#xpDc?!3}m**X9#KUSq1aIT`+egf{bS`BK5CqH3+{8r?sS!sB& zaN6|6?`ZzFQ;+RmLw+uFxh>!W0MA|Z*oF_;e#?S-|DLb+#ctwfr=Wx{)UMfv+u5++ z!VP_z(Aw@&l~7&^!$z*^dxYSL492?;w%NfRy`u3q$a_8IK=WDi#rG70;2_T(=ue6h)AhaUoW1Q0nxUkcbi=VGW^k{zWebIw0>+aAT&g%#Vy*xrrT*#6 zH#OV+j~o{o?^pBSl80j>|qv*&7`7_9nGn}m4gWtsL~4?&z{*_>GSpBB9{b*Q&_3}7Qneqv6sjsf8xCL zCh1}4>PuVwOydb*#b26G+RSSJZuweFcK;k@f3~RsJ-bd6PgwecLb?$X{?Ie~0)s^= z5sn2K_dYy3{rlFwdnsnwhbl<-q4d<*^ zOnJTxIkEf64web6ebYa}JS$RO@efHg@$#*Nl)}PW(qg_cQ5=tk&$a$hhN_2lp@RZ_ zjdn%dILg7@sVCr40fsMHjUbM*WEnD$1+js^pIsh(4**)nd%e#3WuUqN z^yOWJr2J_KPwBz*zh4c?7cWhL24Ykfe)TWX_GR%VCw(EQex9Te!mfXff%Mnigc%2z%xV%r08*6z?(SKU60=E)isYex zFHWar_oxH1cdsGY;|Q%3;BVMR$CedfxP3NFM8vdhG|~A=XTjb88}g-CWx;1UA-Xju z=GOK_3tzd^VCO|{jj(9)9kbA$C`b;vp=PkUGy!XNcX0X$Cq~)rr6mtSE>D_9S z%_4*6*?co!0c7?OQ?>i{sQ}6D0b2CW^0Uw@LXdA zi~0eRd51N}oT)RW_=|}r!%Ud4Z-sA#Sg-$fX-;;bqdEyJbkmt>+=KEf5&4S!ec z#14pvE#vig_9LHR>QX9bse6NE`qGt?-SgBo1VBzq zGP&BP=iB)Y1*n)>H@=VG|9d7z{?YIGJqh~)YfGS+Dvw~2W#@rit=kCr8KBaqz1(ff zWYnzYCm|H$0#y>W-Yy9gcymjU1>73ALYw>^ap#dJNW1>>@2{VL;CMxwr;P26{&Kh} z0fVCPo<|gR@X5OksHKh<&`6sV%cpQA*>!y+CWH;BDU8d`brjj^|L=Z1hk0L(^U;uR zH2K3?2WLA~+i!Von-{E30PmweQv~8+1ruKw0HF?-Pa^u#s6(yfkg!3~Tv~wcT}*_# zJtrn46V?J@bV&%e1w_zxkOlSbH4Aaa7yRo!D3O$l%fQ5b1?XAah7d2MF0v$YHuSzY zM6boK4Agh7CO}veUy4f;_qlNaa|ThBznFw?+ERfQZyN3fbODpWk|xRUU7J;*rAWUv z-?uHBo|LD{iQ?V09w3tDAh*(Iz#PKZ9Ro6g$*L;?%SR}~eYO$n%LDNU7HZJsZ+r)q zr$pY)7T?u;9!`_tY&pO{qKmWr7Rk`w_KSsK23I**4Z82ar8=-Bdld$b)Dt105<4k+ zpT-#kUwG=uBUYK&cB2oscO*WGE;EdU(z~$hm)#H!N4INR z8|G^9;pRxt-kly6u#9?u7U!}naI$5e4py=Aqy1GX9p7gKR0I&nX1Va0mcT{{%z(p? z%Y3h|>=hIbqouG#Xx5NLb@0_L2|p)G^z&lk)SeD{3lEAVdlz zs6M%6hBIT})H-GC5S!rZ(7udu;Cg%)S>-=0mh?X5ZFaXNlq`#a8)5|9m&c1M-pFUb ztPm8=ePTmgL+-k`!j)OQo~moq9Uk2Kn87rsk930V(tTwa^*K%^?;0pvI1r*2Z#t@h zH6-HQG-jY)VB$d5CTyZlJ4A+@7~Fv{gS~z9kw$hB#*}2G#__-EMYiY(nZv3#TsUjf zSqU2fcsO7mSER&=2?a@^$K%dfB5mX=ljHlU(7`=LtFd@Wz%Tk(W&XgO0z{B+P;&AV z+#B{-%%H?A0<5eCj(J)p7j#`>25N^>SjIJ*Tr>l*XsmZj=q;^Iq>CI@Z6RS+RPePi zx#t4c(Yp}2u~58>!tsPou#0n1#vJ&DOKmG2Fgrqq(h)%6>&kmY3YaP)Ld0KkGtH=r zNsIQ}4Sp)fJ5O1$s5!-+5K%a!reO8B;V#HKoHgjSYGZx(eMmnyk&-Z^`g*Za<#NP` zQQXIvgfK2;ySXqk==P;2Zvm;GIuV=PLkU(vyKj>gGoEEez{nm1)_;Au)>>sD$!dt7 z_|>z{|MJqc(ZDY?)Eg#yPuYO}%rX&fdr!cN-%uRu!1D0VE=CGG;EA(+{EuYzHULjz zmpjTTV0aw?;rPJ+g();ivZ+%vB~44JedC$j%l+NYy}cqi!=-9KC~iHW41?D6ZN9(E zaoAF4>dTe7ddJlA$g^mI?sPfE=|lMAWH0;x=Ul8=3Jgz|5qMxe^u8b_EPp;a$F~Ihq7%u=j*w$MeXF4J37+oYM%f=_3g@u^LRMXGXKY&Nq&1e9-LZM zqQ=l40JC^y-_M)HNwTP_zo4#BiHJ1gV(6QX}MSuSigmwTzMGa%22dO*49;9NdJd zMH952n}4TYxax>>P;nN_7@FC@zIx)ywMOMZKK~<=2&BoNNaf8nK16Pk=9KN$e%>r3 z_^q+nSn8WoEQ=U~)86>HUAvHvOVN*O?YRJgec>HMh%mXQ!mqnBidNv%{aq+aAMZ}e z(K!$N>}Kkmmto59osW{cG{(+r0GeuEqTJ=)Oc9`vUu;0w%rTCg8Z<;!Y#F*g_`bJb zOZ2QquexpWVXWPhNFmECtkP1TmS)puPEsROn(WBH@s6Uer-6eR*Eo~nR6gT(253%U z82KqeOxqo&xS}umfj026>X`%AXi8z2<@vvO)$Ym>m9I5;KKSFMHPPPA!a32o8QRml6D3TDGdJFAAO(30I)d86)Y@*%aA*H znjMcFdM{inVD#czltp2%#8$cKfiM>)j*%^Xwf3JtA0=Ieh_zA0zPW1%o0{OgW1|<) z+%kXGPJz?1jFRp}Iz^jV#++$dDRK>ukRP5nc-BihNETd>X=9v~S+E(o$rB8xlx7rF z7=q6;IQt2Z5dJxm`w{iP<<0W57We7w8w8y&@9^o7xoXiYjc9Sqh(+U?81st{=7yP{ z7T@Ft=Ex!62@j$&-*IG5=<<|Af_pffyqavU!`=?uMonbR``YW_S|@KfxqUFf0y&18 ziat8?w^1inv#V(e$d)P<&ma7PBsZK++gasrBhV_49+Bx{@9uieKyOU4(O!O+k?i~v7jC(J3%hZFvJ@1J6C88=DGr%IZJmwT&fWBinAzu*Wmv0h zG1zgBu&}W8esWn!G&KLTz;u3zk*a-YS?I=1eRhDNW9i$ggL+N78LfECNJn248CZd@ zELLtb-=t)M{d(TA_ApC8F?ME{*rkPVZ`Es%9ih%#bKT#=@onwQ@43_ah3Ubh|DRVX@0}q)9!avoB2qy5o5@r zX|hOmty`$mAZndlM(?}>219T_J`ZlfC?a=+vSV0`LGiD0p(=egZ!)($Co5Iv=HVBG z63JZ}i+F6?A5bT1wHm6<3^3@o;Ae2xsKK$jcx<>n~ zqmTF{8G;O>W%XxHD`(0WW`kTm*ES8el4?l*Wx0~#C1S%boD-}}#t}yPAc~4LmC@4| zJBD?-nrIVUTXfBD~HoYca!@;V1+( zp>V9eP^|02MM?gn@2%?v%LuN4hJbaV1X-naFL_xAg>m72d54Iuc)UcPYjAO*5; zj+~LZ8K;nr-Izzc82tfB6SaigJu7Aeb+5Vq=#V_e^j-{)|J~PYTUBVH=G4dZ?o_r5M(r| zTSe=+)uOOJ;cH)VRm^h%PMucUSx2b*3j?7I(jP$1mnj=)+| zhdVaLKUl~twe~e`*B5GJS&Dc`4ZyMUp%f_^0^uppQ?N~R@Rs-KzE4mqbLP4ZWCqF2 zw4VG6Pb8K}Q7Q zgLpM&9h>72N8VcICK*B=w)-+R?q$wwD$mfKSg0QrXtTmThWzAl#=sOL`Qv9w-NEC3 zoXr71qgv@yPq-I3mHxL{yXMzydSSG_@R4e&h8!<~39+q3f~>`{m2-2zEmn9?s}S(p ztd`$3{hj{4245NUuVI}hQ=#T5F!iO|VRy~NkG<4)N`1W)FgkNOE7ygIGdncq5Agv= zEwz^>MU!lg;XyAtb%+3yO7dY+Ew{U-1n%kY^1ufq5A-#6hL3_(+5@=nfNsJT*zjeE z3FW@G@BQPTTuF)D860t^Bb%le{!=m9F6*5Cm}?gUsL4!kN>(Ena(zx_Fr62q%}sJG zWmosb9wM`Ws)^{G+P?*Ao#?6Gyvr)Wp#oh*T+7}7-w;DV%(=X)6ntBiV7Z|CcI0b; zQk=8JYi`19gur0X+doCU0zc(Kz7ZhFcfIIxaAb6IrnG|j2>_Kss?nV$ti?Y|Ybpl6 zTPnyu&)l~x0VbQFvt4}>&MsKXB;4p>&w85aQjNukP7hs(Umu zbz-J!?l<(o^X0%$kWKZWfB{tN1WeHhQ=mnlQ{w_&uR zKC32_8?T`-)uBwOo(kTj0bRM2iKA|~lLI*CX(nv(J#`Zw5l3>#mLdruVW@*L>95=) z8foy<0{qf;a*#6ig@7-KwI@a8JHy%N%V^i$c@tsaF^mjpm*5R9z#2IFIzmy_y3cg;A6%$oI zHHm6(3xEp$N^5=D%7bpLBWVv<2sUFw%S`cw>3ZyYEqVaxHjMv4q7+R@w6_sb0s1>W zKZ-g?WQ$R>(C*t8hK%gi{*Bf;_{vb)LL144pN$Rtyf>aT$7=T2WmzV6+TX&zYzYFb zQ-C}}sUwhF7)aHh4YMjp0gO(qibZ`1PEr%?FC_Z}?{*SwvGFM$&{FSgWSCQYFj9ED zCM4KdBSj=I=}Swv1_1D{Ob3>K%a0jJ_*mgZ4o0zyXU?3^m()Mweh@iTyFzg7mKDg% zQ(tG%s$oWE(-W`OmAwI!O;4< z?J&mje_EZCSO_q^GZ^mL9^jUnLRCJviM3)ww@N;3j5XPwOm*Si$yK zq46MMq@LYVC?Z1A{RBm~_kI+j!fl>*SBE!=Da7Q`-H+z!>3AApWfH#AQuOCr0LLLA z@khxb%vZpvotq4#U`_$a@6HnZJ~mNKTFiuSI%#_!$8qpS36lqGrYVP_8Q;OIZ9mv& zXV_T_86i@@6Tv>ii@)1o9XzsLH+pkIj;M>^b$(OnwPin&?FXN(?i6w{T4{##-^8=) zwBM50@_VQY)FQ=Rm6OS(Na8L6)fD2>zGSHdgh81<|K30EwFfP&{qqC1Xtb_8O5C%@ zB_NFfHuL*zUyj5@!KSoaGcjQXcV7qXC?BkG>=EI#$jKV65B+wbxM0&bZ3`+#gI!Zb zfu152p|HIaeM3sN*z`ZE!0;Y>3NcH^#wTl}1UvGE1?C#Zw*_+2FVfi5(h>~B|FpRe zx&rzU(46^>ddj&^-a;zCzY%f1V|87n?c@OpvUdA38L6{EJNxM))j;O&t$lMF^(C5u zWUl{_%FO*x*tIW{B3fofodXJURk_cuS;ZWdT${%9?4^=Rs*W<>K=)nK z%gdtgd>3a4KEJN#Uqb{ID4sq5!^7zALQT{?` zgM56y=an0~4ztqQ^F9$v=CzA!xLVPXpKrp|!!zTXK!@cjAZLnz$rb`GDas@0yx{z6 zsK07SG72EG%)j3HVJ=01Zp8^+abCyXT5o&`4ohA)pnH`Z$2r7h<|Rh~4s?nkA{2Mc zRz~|nw){TQZ^*nIjfO3Tn#oPc_q*I6s>uiYa17_mGM*M=}iTNtYsGgQp`URA6;%-R~j$u35*TmvqGsXDZTwP$EG`w!25sU0-ix zMPtARO5t~wGj5~~ou4q%60cksuPolb{ckQ)Oog_$33g7fw+jF_`j!=Zu7p5J|vtvD?ldOoX z$DEDt^?WLMOx4JDSR=!9icUsgD69eOcq)^inM+YDn`Q9e zONVv!w?Y=o3W5pY?IvNI7NJYzXO>lJFWn;EcNt)tbONG1bzF#JTvQo@5?b9VKbH;oC!C|A%cX+yPhXO-hdLGOWG|1gd9BlT= zq$x?abyzD~8u7q)2tJa@&PJ2#+V-jW$44=+i)K z=}d14ZjSw3uD10fFMfr(q-;^2yw*V{a@%Y9(6RG7x$uH&73`Kbb35f`+qe^uOb`{Vv-CvpSgiw^KS!$Ou z6@URoQ{;pz`{WMX|L8$`o7#h!$?sJTl$#Bk9&8F|e+PSLZ#g5QgaaFW=~Ub$i@?xE zITdIUy@#Okv0$OJ7h}n{{kGixpiO(g@!9*smnep}9N$=c!>X5lACI!MdG=7*nDq*- zM#Ak8&|3GS6aj=GbdkR$il!LvS-)j*|P;)P+UsAB#{BEtUl8&)8N zg7y_x1?hrBmU1wG{eivS$&=-W_zoC_acOZ6 z4sflpy?e&M4oiQ%#7CfvPTZDQc8yy8#aQVxLM(4xWPX#W^vpLaDDdC(>zfus-U;~C zU_N5#$6Q5H;}a+3ok2FXSao&PohV4pQa>+V&KE=N!P6eIL2l|!(*H=!f^cHf7$rR=EH;>ndAMsJ@K9~pxfA->E6V)TS+xc0DMM%jY(j;|9 zFnUghjEn$daql^x*@C$wIz}e@wM7qngy;3X%~#}pE{AOKYW+_JCH#xywWZxfLqgcw z7HOwuZb}UNPmj%69ct;8^^}Ch8q;Hb8Dag?4GiSYou2dPk!p;*mF6UCx#LZtIXOsf zO!UfFmxl>KrTt;Iu!EQos9XwR5)C(K1@FWj$ylrrq~ERyL;P_b{LA7-NH~t&3P}HR z87&WmYQFg=54;v=;+#e!I`HD$GUBaBFloedx%eF!3*CO-@zhb&ax%53@YlufymJ-U z&WoRP|7x>DqfGF&g&lj*WU_g68oajr-?L`5>DR-)UOEUNPu`y$OiD@rV&$a~<$VX# z=B-~$k!xhZx4CIO`o=?y0KSo8<7lDGK$O{LrDjbUSc(Xug-26qJZKL}n(_N~l6ut; zAsUhU>2pCuKJpu)!#a_O-jbKBt%p%u!C_5exy-k`y0q=nA|v^iv6gd9F&pp`=&@_> zv#Z*25%7?2(ysg8V=bIRw-@E9ux>6Z+|8BC8x9+lH+hpHk^O228S+QffimazM~q2Ht%Bqrko(OYJqqh~MvHSg0g zrwX>z*vI+jQa;MrEirc8PeCj`PVBsN>@czS)sHu=7Kn|rSts78g5yo#PWe4sbLjwT znKYuW#-Mv)Kjl@Np2{B+qs)037PfzDY6@6^J>q{En_Xpvoobm~KT>{EX{lXuFRa8osUQdGK9gVJb# zwZcZQFFw=T#o0kZUjRynH`o)U=df$B~OkZ`g-ahB+DO`GMbePAwHx%%I}StXq9 z^wJq7@DDzClP1c9sP@BE70n1RCYbh>H7jSKhoI8#lNcL*(>HYJ1VbLjuTs&JE&S8Q z&wNC7BW3kUOpi&Dyj0WFs7PVmTT^hG*nNxO#TNs&dcsEpkUDm%j?g1xqzm4U1WNQ9 zs*_IJ2D&uKnjn#((`5*QJ80@FYS1P;q8s(8`{Q#TK48*NpFq>kO2*`dv@q{y4+TE2 zXLGYwL&*Fq6ie|0RGG36$$o=JJ2?(4-36PWgVEsUKbfDQW2ruc#Za;kCF4Xi*>jr% z{o9yN{EFi`jK7F8AiyIJ@N>6Btj6Kc?7p=Mv2#a(p-qPjt|J1O>qP!d!t6b9jrM?7 zCHdZGb-li8!2VLVVDyv}((RoA(#vjpqaSA7}Ydtv>v9Gy&&_D4*sVdFf0Q36b(0H0Q|8Zj=ME zAmx0{|A?HD=;dy-Zwg+Noniz&DkBj0bl&o}vp1pxG5`+wH|$UVE^aWZDE!jlmXXKxYiijVSEA@Q%nQi{L%?B#7Q}OXy6{F|A z0|wkRB5h|@g(aC8{lD_UMSP+XGqD-Jk~faB_kwN@=`m;aICpNWWl>;ADqi$+=h}D^ z6MB3a;MfL#1XZ3YK_5S+n1R+$JIZe*v@WH~X(7kY|^vltC%Ye4ViTg&{Lpd&wRizXi@ zJ)QOdV&2I9PKmMvDV!<_OA>UuJqX8jWz^_&@dldfcIQPXU3WMcpDxc3Q?;!A`BY+X zAO-dRSYa<(1B5}hnY`&~1SK<=|4Q_6rc&z$6}drzllK>>iy6Pd(HP4}j=b!TLMJ7; zL#gLG01!k>VO7%TZu;E$|D4fWbD{$T{Cu;7So*rqd32q0e_H71Ub#xm&=x6YEk=BL z70_&==ibv>exMpR>dSRz_?#Yg-bk@k_A_llgRT)4{ zSgY!<(SFc!_3l}G0?^B!_rlMcegiDm*Ac(4W4n@;heu>dw zj>15`ZR!9wkrdHxjdAn#nZ9J%x)T%_ID+|1t9e~>`J4ni9VPYi+ReCfZuMER&`u86 z+;8&Vd*VFZ{>t1%(Eo$!+db-JZB*9L!^rie0NXSm(vwQu<;ydF==U%OR+T#=yqp5Ypv+{&q54l`L~4=B{~j_@jra&)GZ~ zAo--OFm>qbiL_>8Q9(h^qz&=(z!9f&Tite!7$aqw#gzgf*J4ERvL(v4;SW6J%aNA( zm$98TB{t)%FV2p}_VNlIjv_uxLZGZXagl(fhMXa=BKdrg(()B(q*;!kJ;VVVvk?9A zK5lKL$N8scgY&@4x2e<{r0v%UQbqvfkbZZ%)75-XiTqbH?%0zGBxCY8+mQCixzHh! zewhS?pJ=Vyz!ZENKlSv@$`?q<9y*8@E;)Ufcxw@=xh8oz-K=!_TxDjL(UTkIwv+kB zTAt+7Q>gdWv5xMq^A|22FbjsOw#p({azwJ1NUZ$aJntc$=Sro+UxeL?5o&w85TpTp z{6y?D>{4d*Hw{^^!ZD!A(WTyPYHs+A8LI+1vVP^bkiU*$rVRGVFP`0FN*#}bl>&sQ zRD_J~7TA#rFrZ-g4EPT2?!G*c{5>Hmb)fW!Sq}kd6alS5?thAQRWI5|L>ipjRi7qr z;XV-|qTTsne}HWpI_P$_t~a6g6F`z!Dq3{L{Jyu>0~QK3Jc(=;1kGMd&Y z^Fk&BumDi>cQ7{$#HqP4pCVxI+yVkEBx&9LDmhIO3I><)$&>s|rWPm3TS$h`I_8Yh zEmR;P5}*X<(l5K>PeB!i|GU@{4ZUFp_kQ;HeM~o9gmDPB);^aQT3mPb6FlB~Wpx(=2<4Z<)EVtgkg|I}>d5cxljU>8DLaeya7 z)#dPt#e#19V={&`bAV*<%y^0fP+3Bex{IE`a&NS=!>BF9xQFI+=W= z0vK*6Y~vO@!8O7Er4hX3N`cl_+%(-MMy<86-x`9x!%Z$4fnHyqYp7>3Uy+HfJlWPS z>@gxHTn)QSyGPSs1&Pw*s7JiNgvZ9)wgJp%SRv2Wwh;;iXcIq5+CS1zz|?O$y&PCw zZKcj|^P>-q$E~4Ii#c);OgPCyqYHrKloh2q`_&8Y_*3G*#lrYhNjT>rSQ4nMoZo{8 zMM7Pk5&=RGd->|8W_YO9!)zr}GG(+(!_x0m@xtZrPM=!9Qwd>Wk+J4fpkk(tM9u=z z0BMA1q*V}1X4L89rIU7CjvyHk{F^EsM=1- zYx-_R_@^qBmi;!s!E2W5r9yhvTDkBm8*rH6T^M4$rxF^9;Wkr%mNOlNA6AchYk@1k z=>VZPZEPeIvNf`2H22)n&(DHs*5hEj7ZH7&{86LN=?@?E zMi9(t91s*_`13JT%Vxm`ptWPA4xLqkrNlH{?h01yu`u=FXJMxThl)Tayq92~!Oy+s z>4W`c(1$>o0>mq6-{4aTgXL#5!Dr=A#Fo?7dWyIV4 zzBSX7(?heZL0cpcyaU_Ww3h~|B9+!J{Uwr=d#c#ccsj^BZkFllWoYWMeHZs<+r1k# zA4EHCEG86k8Sr#d7Poe#Y%ubm`_!5e@htav{;;Y@wuBs17caGu6q{ip8Bra93Bflt0L^&y)}m8+ z6rsiQO-Hp;k|i_ZFSkCy@#pOJh`VZaAs)FBaEceHQAhMpJ{DaIO$NYc7pp;4M(W-n zo5ki_&-tSU%!65Z|L)0qi`{o47(kpRgGVk<#TVM!B>7kK{q#9vz`$d~se{;x3Z&{L zcOCxh7dO~G=@9zbx|>RVZ1Rm!-uVBjSy&TNIQ<@V;SP%F!6= zvz#T}tneQbcRrw4Rr-KAaNnFZw0DD5_J{2}cuEIou+(tJbU&p(?$@ga2XOFashrj5 z13sO96^B5o$``jOk&WIZApG=3hY`hSfI4QeG^BTwm07eY3G^?uol)(jos94tKhNy) zOvtNAddz9xbxOB-&(iogf}k@XQ)RX3rM((gLb$BSzXFi|@}KNFr6)0_WqeM)cj)ei zeEAP||M$~CxDY*s63Tkir&*%$|E0Qs!hpq>d0b8XlZ1g+)$GT*Lm~lT%q_sY>`!bn z(Nw+H!-7!_s596Vx7QDS&Mx6um0Y9eD^T4M+y-OEOxbmt%I83g=De0y`|c;IV|5_G zzqaI?4_0A`ftcjP&{#T6osQ|*A#v5T|L5}JMI|o&*& zgJ6qMto=k-A@j@5bR)X+pU{hN!|1qv#b)N!e;{Cwot*;`q4`p+|GiWy&iH5UWk*RQ zr}%lTLx#l?;fw4Nu60hWcFZLb-Za-@MmO*5eIs8uAEHm<{~R%qpMM%|3Ia3;pLRGx ze-`X|749qzp+L@YJMM!{vUgT(C)=*t4_{y^=1LYASuxct_)M76hTt?p zuyUXse=!)3H0Ajx-4mCO_7BxY+q`u8aS`@9MtAhv18^FQ_yIdc%j>SlGmn*~aaJa# z86i-tit%O0*x^_&v7yR#guwD*jdni{DRJXq?$sUOEG;Y;*detSQMczt`Z?y6S&DJ? z&vTUHcw6(>VS2Cisxth5wCBImUiW^VMEdSO0zYWj5+a3n)wJbWZanIo7Jn41ejBmI zCt}<3IDfM;ISZiAD||5dU`p_^xxuxOEc8q!Yq=xfN+~NaxJNGAmr?|AxmJ1|ZTl!^ z%ei3n5evg8rX+shYxOHi+zH^BNYLu0E4-c;b$B$KM#igP>TiCkh(bO`{WF4%w@t$I zmExHNEawXiPRC{MAXjmsamjWs@U5LFlb3W9K?1Zipgj}`;Ux>hOU$Rf4|TeQG}_?; z$=Ih0WE-XV(o0cq$jt5uY~<+lCt(cv#EMT}K&!6#%pc;#>NELosa>Yf>Go_cT%Q`I zP_RH7cT2=p$)ug8PuG=hEpMQX@W_Hjfq~oYrfXzeLh#R-s`h8&y`EgLZP!}6dtp>A zyPm^-AA4|{d`yhGgg_rfMPkB`KpV;npViVd^E#29r>|}9O>+wb^W$TUE+2c76OEz{ z>pt$hJU6IFXV{$e#HnOW%D(Y;w(H+af^JFWxqpr|&uvJMH!3;(IFrJ6F+}D1fkGFz z_st^kzblsP#-E`;tBNT40c%Y(M?{TI4!edeZBsEA)6bJCD=eYOHu7wS{#g}pZ?EG2 z_B>sJWz*7v3Pjt!h1gwE0Tj5Cc{ih7<;|jR6}iCNCAX6XxuzQ%2iGPSW8-bqBW>1! z{i~hdzkY~%aTl6EJv=Vu%U$lB%7vsRM{g^!IoLk{;NKwE%NHcrqVbIa?_#Of!gDxn z>DIhcAH%f}`lYm^{en^$OmV{*Q^>F@c6Jff2&% zzGixSAG*|+s3Je4P>QBbNc}Mook7eGDIl*cXC+0f0%O{^4tviR*9Cvm6`M#z+`tLs z;Bx-Ev|kl?GJ2Gn7b;7LSeEY(_MBVo&!}-0yd4wE(H)`l^JCCeMTQ;~2C{UNzLN@B zY3rR@Y76juwNjS6{GnjA9JNtZ+y%++&2?4z<4hNYG{?bh?pO!2oRliDsZtdtGm-ZP zqHrak9$_1Dv%5w0aH;NR=A$PCLMY|sBD(cB5m5I*Fo2+k&B)&kt6~30KKaO=RxY*| zYM$y~BzJs*LJ2#xi=2#t#dnkHnP@=AnxxY_m7@5v@UGq?FDp6NP_#W?>g}PxLtB<= zJHAx?OS{YG5x_YDb*Ceb#E1-&$H@n)l-%G)y{8e_u5=_PX8+xY^$vEkquSYgUS#~F zBHQ(XZRq5mOuBq^q;*@JpVNwvSVZ*2Z#B@uJsVg#KxZXzLBR_|Gfh!tvm%5%O(ok&BbE;7yH(MD7h_`7j zcq0{JD$mC4ERq*8*_Nj(=tL-tCcozGjZVPB{W4-|%$F0yWD~-tq9N|DsjS z6UZCQ9IiX#c4cxb@|E#)gvn}mCz^YesF(i<93&&i)cSBVm$OHDXst+JnM^iVF7SQP z2I_WQ0{_w17S>@1=_1DUx0)vg-rmLawXt%N(fxHiLAoh(9Q08d-&FuQW<3Fdi>6fc zuQi!oyGs61!7`#c){_%*EnfzO7RLz~S(dAi6bQy?nM;Gnq1SP6nXpBrccZb+J&T2V zn|qch%-U~d4!_aAk&En_n+?>7ze@gnBaByIB z&nf(%+KV?{m6A|Le{_%pIo=eq2VC+b6vqim%fqQ0ppAjaLa7ZFYJ_fe&TC=GN5Gzp zO|6Ewb~XZ&wQl631vAt}ddNWYVugHcsj%;UOL!e8$&|VY)qq>Aej>WwIQT>^eXBdY zBmb}Fs3sxTpjP3U5G~`$>3nOtbMO8_q^m9s!3NOS=cd=2?m+yng>D@a0hD2gggPft z3sF?wK(R6QJxjwb2({{>_P@mc9`vgp6P+Z+TSzd{wOy9Z_Uvb`&GL9D|GWjAKl8f| zCg{)VaPNSF-WWb;8$>(qMgL>Bk0JY-P?KZ$hkIrLHs7d*{la!at^>`QZawJE%}uqt zDem36T(69R@zd8p4*RTefYa};hVxvot^BVq@ci1Mf!35j>C4PGXI00-%IKV zJH|c((BFHF>>=sVWskS2>o4ZmKJEyPO{<2JUk8~!a5quy-C*wcB6z4_2<&kOL9&}e z%;O-ypI8Z7@)vZ%3-=_*b~-L0=Lqx6Rz&-g$=l~!2adc6qfa9|lh%E)+PiEB=sXuY znGjR9qoFW=M+u(=U~)mv0boT_iyu-pcUvlAVcq0Xkd>?a+FRBslowq;c$~t&>`A4< z#tGXjVDuBQLd%H6JI9W^#v7Gh59D?o+CS~>6)(Hh*@|PFN@D^96k^h3&UD2Lo;(DU zK25mg-j1f5ML?=~r0PFx{W3mCj2#U$$nkNzc7sMYV+S@afo5IZ1D^p8>mwQUVrNQy zgG{mcS5TNiOQ!1^hctE_AD{naNw-+y%YJg4^^YCaCic_C|}v=n*URkqw9Br1RvW z&f>d*4qT)~Cf+6HwulZm?ImEJ<&P1S?!WNzT-nE)*kdeQwh0c52obOmEd_sH^>h?; zpm}=fvVuF;w{%2C<7h>NO-hEFMRg!^$!e<2BKY_yYasVdxxI5a24Q%j8e6?Yx3mo} z1hU`Grss;F`~PS>`H@0Y2g=B>E&ynq--H{Em99e&>%C;#M{df{-@6I-nDKPNsI&@4 zCSdhrNdr0L*YkoOjAl6w^)|nSy-pQBDPq!8b!|PLz>tB6@;7Iy??LXwD$2iLcQuZa zH*Ti(1ZTuFAE-eVvi&WMU4`2Nn&0Do4D42v&$>=)w+B@JD4BRT8>!LBFg$Tqq>%um zvag|wK`oZkgXeB(;iFx^ju?b)jnhQ82Y7v#gx1&{V+Tj70nUwV@k+H08zeMDdaL$7 zDSM^Az**CXH3&~JX}Pr(@P{I7w;;N!HZ%}Zu=ZUR#lS!f6m;1BE;SLMxy2qZs7nWl zcvFY`Y+9=dHMxT4GSWK7-_8jbzBd>ZrpufpTj?SkM7IG>`5}O0AqO!sgf{am4Jf@# zK<{4EggEr6D*(ReFjCywF&$vD0xXo^&z}XnBhlhex!vc4CU$x~>b@TaUMhUkqBIL$ z7b1M?;t?KJQvFsnjtH0N#QO(lY<{_tcUqL|g_a-)T_p=9AL`sAU8t%#Fv!@M-XUu1 zI#&$6KlofKI=Rvn+C9SI0X4K36_!rPp4{`cMw zZ@l;MeR&ystTB=?viI6+%{kYszgex~QW8}9;M4bF&K>a_DY(&}wOs43!F4ZmM>~z& zRG0m=7$_Icf_q$_aqs=`z4832Sq&b)NeT&btLnQ5+LOyjm6vh3dWuvrd*717&2A`q zmE0u|%iB`vATXH^M81v z6Srj->JE;2c5A&{SL5!BclqDv0I((|s8AFgH)nXktpfjSCRScVkmP{oO|6V-031dI z!u8>Wie3`_9AB(tAp`&8`93oDeH-=)P<_7kn-Osd(gC!Iy5set1LZ2b9P~K1;LWgg zSqf)nb&P_Af^007bE z2%%f@RH0V%Xx-LN0h64x=2NZ8As?TfL^kvuD0$TkDr&l@QH09j$49@spEOyqd-vN} z6Z%Y;K|F>Mwnxu642!?MtfjpHoD{FT%RsZY;b-x>tnFuINRBS0+a`+!Tr-zUC_b}Y zQ*b@~LI@17=eSce#s~xD( zTbJ;3$!Re7B~0X|ut@CJrqvDqt5p~45@!-{&Lpi1vJ>>66KOa*M1T5#Vq74|S zsTzFCue4*`VHcgEn#z_jVF#1kGy}zAx74^l6^V-ysdmUhF%ID{*Cy{s;p#f`$$##8 zK9)#BwxbYlSxp|&fNY9VgaKc>!t8_ueLS4vCtOA8izU5LwMu1O}T&t4QAGu6(h`DQBtLIUu zfcNL|OUc3R-iC)OPZ6GXo1g(Eix8RK1bTx0b#ef~#UrdjLO%ZXxN< zs*KLMy=2f?pLe&EcR|Vsgl9Hkr+w6Wz@=%;cG^P&8bEM4u3fEtK<+6JVMB>Y2w5r_ z_w%mylyR4tTrm{38b5x7z{iFho~uaJ{7$1~%DiX(Mt%MdrnOr|0OQ+gCvbgqHxKO{ z$o9$SbQw)|N?S}MIW4YRe0@ueF_%Dca(y&5stS;`r>N|`e$Rr|tHag)0;bVqvF1!_ zcfSC8>DcHEHZElU)A(RBiCbl71W}c6`oJE}b#Ko16Oi{YfUQV>sZ~559P`Oe)YvO| zbT z6`kv@xRRu`tDk5VZ%c*=2gtCH$Jp>)7X5i@KI337{IYjE~l)*z&BiuIYw^d#;!xxXO*w~kvnPh z!z4E&sDB53`us1e>?%o1ckjQLGb~$rnSNca{gp(?ssh5U;HKVaIfh8DVsnkeQ1$t4u$;4Hxm|ykV1dOatB6%57U1lFEqpDSM!qG_E;@*q}a4 zeT}hcH7(Lr!K5Kcw4ERx4!^M&5wDaf;m{OSq5Y-zc8 z3qYQ|Ds8!HM%HXaGvGlr2RS^l>5qEbx6t z1fsO;YKCtTxfO1>_73Wh-CI*$2x}iQj+HW%P{(i`%m7FfOyC@+eXk`OUK^WMmBYWb z^KTl4{-%N^2yI^>11hycC3pFfL2a&vdgG|$$`z$Oi|>~ikX_0l!S-XZXM*L8M!hZl z)mZdX+e&75D^rsH1MIKyntlSGdP4sNcl%Z=s?>&xbgOr5sU76KUpMP~0PykKLdR>f zfnPLws|Y+mBMY4b=4{_X=T@Zf6^OaVGo8I7;ws+3u2K#zTB8fG+nh#u>%O@(ramZ40T-T+O83 zz&Ed>YGrYl8oRuNfM$+*XG6I-@!FHCpA&Q{(vjv8w8HD!mun}Lp{?ifv&H9|)KkZA zJN@-i{IrTjDlNgQrxz#V$iDZYYKZd)eS>vqs9egq&p=yYS*HW2xc|mK{{k-hIoT&j z*^I5`Y3fp3ZW-`lao40Pr5goPDBtD-f^c$u}p*eqi z8KQ=qlOHh-xxL$pRB)fZ3_0Q_uEps#;R);SEQ0T3essS-b911+Ek$zW^k%9O2M~fE zj|cO7di3f_KB+zbDoxV!7D#9|FPE1;lCCQDaw9WmOq6H*hLzU(7B3#c)Tm>h9^q{)(w>52o=g)kV>ixco-T{uHG<^P>o2PN+fF*k+BJ21nOSY<5}fU#Gr{ zTUriz6UIExZlshv6f2Xy-s0r`{`s%JC8m#N+!sk(PNh27r$dSkvW0A6vRJl~FG)`8 zT0Q-!*Annvnrd$YafNX=ns@+hb=l2kRxyU915>)sb}Zps##xwA#9VxZ8bsmkSxn$$}R z0j6t{EH>$ag~~3=0CP9BtOG&qt4@Qpt@dJ&Bt60f;nc0B@|6~!=gV_SshCkw$j@3@ z%n!hTW!IUvwz3b)w-PhY3RP?34SO+sAAr(l2hv@E&a2=WBR-g9JPNS!SPu6LN1g64 zAniT^!g(19SYiTQEH;er^!23;BVrN@RO>8E;SVmBEU5#Pi34Fuu5&p zZ}2kq%vx$*?mObUkZYANna8nzi5fCro{%Z?N-IH|b3Um7u$8_?guk()d4Vc*+)g}_ z%tI>vdGc8Ao;-EZlSqvkwm>(Bc2USP;}n*{lU||2D^CC{nD$i_CxW@|ozUNVZ}WqJ z^IU%+rt>iJ&bM#XXo|V^@xyBiXik&V;uX>>OSeCc}I4p!At2Cmn zOox#snEd4{ncn%~S*V8D0;nZv`6O_}3DdOv7e4x?Rz25qMofyhuoq4c@m6VyPbm?i zg#j{@VH;Q;ANE1GurOwQDm@fezKS=14q`+f?FUEef~x8h0CQ7s#3 zWus{Iu=I4xy%2I$zWH#-&Zw8{Mm<$^d3RGh>EUSdTZODm#Zh!fBV4BTx1g;hX2PM(85?qT z&yAzkUV>$KZ?zt1C6>d3hjAcNyEY=@_Wqg={>NI=M-5tUh-Y|*`d{jS*um`|6golU zsZv^1h9&J-cqiIcT_N$X#~?+{25BaSSe2+;ALI?TCyQ}+bfBE?U-UHFWe6$&$QZZp z6W;S7U?;m9U$*{)RxkD;VH1;u#0c@Qx^uC=hUQT z`B{CwZ|MjXmH~cibY`P~_w$$N+A~)_eh}8+=OiPvIlA(ZPCY+j>N{)*R98bea&Ywl zR=%f;a>yn{^eQf_OI#X2Fzr2T5SqT2A0ez||ExDl7+4%vF^K5v<=Nk@RQCp)_Jsle zsSrB*0elibqW(<)0e=}t08g54aR?jYFq!qdsQ~IJj7Si{WQfY?>_m-(QmN>DT960u zavDACzpnbMaeqHn<(>~m7h3r+Tkr@jY$9w^RO*H1slAqy5z9t@Fq)u;J{b8a;6a)+ zzPe09#VU~Fp2(>%F$nA@um48X`>0EmZnVY7eN`aIEM#rT2eIsRMwfg49&|AD&5z?# z%e2e|(%RX&DL+r3@|r2n@OY|wX()5tg|%VgX!@Hz&kLt*EUGTg69|oaUvLD3(2kC9 zU*T!pe>f6BKu>US#9}Plj~wldp0H|0NUv}ipDaJ*Kn*&^B|xRpwV^ZFvAM~(0T=l_ zCZKmpeJl=e_|*zs*slp;a$*$&%Cg$Y0}cBlnKXvlfFSlPJKmg*+U`jaTd*J`&dqBt z%z8EiF$DjBZq~0V@D)I_$ZEm?p_6^bL=eV5qtVeXTkQK|wa=&@^8+oXQ(yK_{lh_# z#6>#4NFRyBmGulB8t5mpnn3BN10Qbx_@=^cFJd7NIujvZ-sw;Iy%gWR@8#|KY~&&a zJLbIV6(n&U}|)OvG+o_hV=YhFj-TtHmGrCj@=iR^lvcU3GVWOZ0W9*{v*Sxe5@CTRAH^mJsUqSvT6=MCt11|`n{iUl?Xi_jzrOMd_d&GO3^HVqw=dtW~1AAWG{oC3>XJ0M=Zw{d^h>=!#bN3eu zOB`hv_deXF|1E_kWWOL~Mp%R0g-2iQUDM82D!Os?qTRDKeEvu%Fx8d_m8S$G_T8_p ztpRF8r0+jwbM1XyU+f!4@ov8o@wkf2tF7GggWQb4Lazpb#AyV3k*{j1*|Ms9X8h@O zNWVD?+r(ezbIZ2*(R>by=!!#d#5S(7iZ8DjiQ6@Q=>Qk!;6u^@;WF8w-(=(rI6EL9+(aN<5PkW zk&_KvZYWRh)7E2IOBS4UvxR@%9ndV1ou8!yP)>_6|7gd5j}XBYLC6LTfOl@&B?=L* z32ks%`#0zR+^VtYomS)fV#@c`w6cDpewqaYU9wDE+b>0O9~)iJ#fJe9u?q6@1;?^f zcb=TBBC7O>C8OTfb^kgAa_3pXKAVzMc4f<*&bhJPK4YQx=6z?D z68`7)`59hswoRTv(U+ahzyYTuq7skZ{W z4l3}rWtZbZ-QH6xZIB!kNDOasDS}DuqDH+lR`Ho$)|QmMA6kLR&ZgDcUF&`*)W5DZ zi11(n$|*+2!WHUR7b0aKd0aEP`xd?aX}9gT;Ou|QFEQe3w>bCD-4E`;8#kh()-LzVchU&Hudwj+noJ63_}PxLHOzTC;PUgyOCH%c^#DtpRl*^20SM zEGBJLx%H~z28-2tV-&qSz{^MubE!grW&@{B?bCZ>5|=(1H5vBtFa|-p8h z65AY|{38;O^w6!KLF1r*TicH=Y`u=(~8!f>s_lQH3WTmrEN;I&_vpUlo zm*kxb0{I}!s^CBi3CQ2l>2H?gxGo}4jwS(^KXZ#&D}y|AAkDTqvcveKdb7h-UDiBH ztgce^>@=sni0&EkWmp2PBSKE5Rm{7zQ%7j|QDe^+cY1P-u@3&qbmoe5e*mC}b5qUy zI0VuJn>&Ncas6%rqNJMPa@qq9+rec^Tffgdx7s@0KYmwCLAtm}3F_ZrVeF;JEY$}m zU$}WKPfF!LT@!Ufid|e&|DATV^yM4eHXrle?%xYR_EuN#{&o58@D=v_?)?%* z;S$^yi^*e5=W^qyjl1x!nbv?vDdO})wFU9kZ*T-EeI%yO+Ne`~5l4&P-quKQuU$>Z zUuHIO(~O#9`aeYJz}BcbV`lFY7?#La}wp{W}qF}z5Y&zKka z=MQ%tDQqOU^q)ER;-{0%PK;dnJ#kzWyg#RwO;3WvNhemWIF1Sn&u6inW+x4#pr@`# zPn;^oLJ?n!-VgA0RdvC=TdquQv~yMU#NB-9z+prM@#LQklfS`zqKsiQo%U?576udk z{O4hI_2kq?_h@M|l9@y;rJasjOcg)Y-QRbhq6MFny>!T=p+%-Jk!99Naz|0BoT0{2 zoN>d5!hNheZXQmI;gu1YRkjF_vWE!(Q(v`RazO}BzWn0X_pLdtesC(-?BuB{HZ{AF znW2jLaD1t2%wyV0Rp_(8)yvjD2|4g%dN`LB&j?zqGWXXMGhuhZti>67pdghcp@teL z5AUrzK(`T15V`MPB6|vV3SwiO>)sM) z6lHY^G1%6X|2b~IvpLW&fl2z4)rw+uuGMqgCXtwS;^*7fnSvm8F1K9@WQth`by!rX z4by2j=&f)copP>lzdgOjJnd!fpUm_)baG0E(CpQNdPvyC9^i6DrW^>(9NLyVgcmJ3lZfjD9!Gf^h zUh(y+H~;Na@mUj0Fd;!bbD_OV`0zC@V4%~hf|uQ7Z}@D%hY4_3Wz#%xf=Kq+zC?Fq zt-peWI2LXR3`89?K;l01Y5jr@Sf{d&mm;`5+uf6*N=41{IJblK}-_q>`ay?*!YEC{_BJ1Rj^V zrzV^#_BUr9^!~78he$@n_$cD;Q&3b}{750g_WJwG9tUs5;Z>d)o(uCEk~3R+Lo)Tw zbX7= zkyep&^)1Uf{}Z29PE9G8{xrG;GMMw)mw$5lK15IxAIc?`uDC ziV-y&6r&O=4R~eFwsbxDck;=U*y)`+gjMvAlRCAw*}7l^*mHv)MINIRbEs~mbEi*v zqeYW_L@8s=5&o-BxFp^fp)K|OPKWnbD+PBF*~9y&QZq}68M$Yx|PSEpiF#tC(%UVF}! zUlQ;~o+j@V{HcAI&jtMn!*3+*{OO#B!UI>c5g^ z7)Xj95iUu*bd3cPUhv*_!wz9c2y7S3MeQKE1(~~rpICRjpQg-oR?`p5ZTr;&@KHaw z3!Dai2QGs!Hax7Z1jK8m7-~745wx9|Nln}DkvG`Z+jz%5|DHaEDP}E-75Ti~AMZ+` zwEe0eUFA=s=$QY;H0!m;gM5S7Ew0Iit2%r<-PhsKF=%{^$nFtc_O(UcO_# z%T=6&j%LpE4ttIiv@vM0Ds)?}$&&u!H7HpxMk?&{S>Mjj2U*TAwUut^oIbU)p;Wf$ zb21vfaXCc$w zG4#Smd!wx?mLQo6M>Eh)8%?yjluCEgJSq(z$<&ek;UJ3TCBEx;>M<2m86z#jx!uxdZd&3H`bxN<*~&ECDS_~^zIr~3Ic{whj-f{Hlbv%ABAu*C17BQfW{ z{%@{_8|<~Y17z3}a<;j|J)BeZJ0A8xhd%64t?X>)h;vs~iYNaHqX zs_WPtPWVv2lyMo9bXrIM_cKgSeO#;F-7xQnulVKB4C7TQs(bW+!!q1sbn5O%%=JZ` znoal%7uUnA8`)90?@#}{n7=5Rf{DAYt^Z&r3=Mqa`!$6i%BrRXn!1i>R-~U#RKfus z8O7h*%+X!=kQXvjTX*vzN9C(DI*`ZY)Xfsc4A_scQ2vahxw-wCO%w`mrs&fA`1}oT zlvj>T8JY%Lx`rv6Gz@zTPxT5X-gjhaRdht5(g^Fh2m2)r^}$w$m1CjPQw?S(En_#P zrx?u?YdufCj`58*yyuRJy5Df{r~)px>!bP{=M3k~Jv$^dw-(jG3G2#cW|xkeZZ`N; zwLx-SamAg%ZLf*bH@vX}DG&!oWC`Ek)Uq)0uzYNGW{QoVkT3!_J9%LBs3t724V0Ry zly0T9x&Fv@b#Cqs0#3+6ZxLMp35GK}v35nKOg7EmrlBAsocbe9!Ya^!Mu2MjEYWKj zhQAJT0-Rk>dNly32iHPZo zUj}R$N;DLOXIr66Xk6C{)u~trWZ2->lEv6sTW!va>pwOq$4y`j zTfOe93%&`o2>Uvd6t%9&PAnsw06dSMBat)^CoLNG`VUL1o2%?{LpixFo43_*XyuFm zjExD*%XBzf4QBAGub2z(*lR*yL>g`FCAazEzh3m9f`xQ2^0`!kRUWv)zkfK>RPoP; z{F?G1xT+^C2<67Bv4*JIBO^MpBXfP!S&572j`s`MK?bh5?(T`C9G?ej?Z++m7D4W_|v*z}I9y!m=WFFpbNipF<_jRiM6g)yD3L-IR8W(0PXc9pG>^w;QOBnP0@ z$cYkn)0Ut`-|qeXz{eR%4|kyr#$M7C`V}P=!&c&q=NKylc0m_d>S1kpamV~S zb-rK2;C2-W+4odcpsB2*3?R|jvgakQ?2FQgg_0edkjIw|vLoIyfVdt1rC zt#yV3R|<+qP8H_JC@-rm_NGSybWq9NkYT!KjeeVe2bnH407NPJeKHnqZ3W!S%PLi_ zOFT~iYF(^7Ddz1*_Dl~dX*-!kSeJt4ZLN7qnnjZA+3!+=?^Dw%<^472*!B6*;W!m- zNIeU?lm=x#cL9=bVk%dMWbrpOJ_jq002}>wXdtcUNnmkZD(7+KoVB3$87Da@)RjMF z2FdNf7e!fYFEjjb1yy=NE7b6zJsYs*7%UTh%ObG7%TQAD59_MTWGMdd?Qf8F4;MAR zS6}BeVf_N2&K-!7t#Y(Q5$!cYm_Rv$RtuGg6Oe~Q>8nD?icm{fd)u~B{LmV3kWt-) z2y4k;?@09pJpk;Q*9Z^7`=Omwtx%egxx5aF$hR+h+#D(BKy+dZP#gY6tpcC}Ip&uF;}ORm@X~gYofV4x*T@=fDFr{H9q3yl1u=o3 zsWUkS%R6Dhhhgq0^4we+1uFhielkE&va?%8bBWEJQ8#o}OAveZzc-!B$)e;Ax@}JG z!4#7(-QR5z!PXu_0!)L)*3!c&LIQ%o24^}xk~C2DZ}PN%6%K6pGlwwdWD{t$jyNwQ zJTz~Mj)zZH{3_Jiz6}NsX+eryw(j{BCty7*BNX64M$QrgrK2Ca3WYwechQHn9|JWY zAS!iFTvhK}_?a_tvaUW1j^YxYS`2gXEC(niZ=mT8g`g(z~;WJE8`-e8- zd|57N>vzHjlO7wihDfukwt8HVlE^JsW+f_A?e4^8VRYZ^d>z6WM&lrb{-YeLKgtEyc+$GO9&qmahgL2819>MCgOt z4}Vr%JB1$11r zOVZ|}WE9E+VYn=oAeM4GeOjS>67(31_rC@v>z0V1G+a3jPHNtgD_{1MbU})I4k#=w zp9|LyNN{ZPnOX*aDbd=bJo%#Ft=Xc|Vh#)M6hdZ^7V$I9Qa7Pw4YEJ$quEaH!`f?x z8v2c(x212{M4$Gf&7O(AIJP{<4gWxhaG&%n#v7h`Kish>IdES$uTE02iS3HRuqafg zI1{O8*}&WV;r`m2Ti+1fO7lOI4pS%V+ND?(9?xBxaC-m?kFwj5FR{9|IR9?G3fh)Vj|4d`6+Zzb>YtNTml*zp$49{qShdV~!TmN*S@Veg7_C~uj0 zp|$A7$nZ5VGZaHgIy|XLE$eZsZ=ZtoyD4r>(8*4u2ChSkuf0I|P*3>H*6bd~JX7k7 zj0_r|SEd7H+G4zJj>?&J?T%}Fbd7lFl7n4q9cUT2ojQpV-=jWJ!}fRjxv4Ub-wLz) zxZgK_MirLvO6PpKUm`^n-jXnA*;?)|JDh^cfpq;3GG{;O|Ep_q&?kw?EM=`O$(G&W zd1)q=;)!zv=8X(}tePx#E|>rR_5XMUZm69x`(_YnqtUT$WWeXHmY!y%x^?9L0=5a5 AUjP6A diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index a3d18bed77..825fafb0b6 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -27,8 +27,6 @@ Custom Operations/Nodes FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" or domain="qonnx.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`. -.. note:: See the description of `this PR `_ for more on how the operator wrapper library is organized. - Custom ONNX Execution Flow ========================== @@ -137,7 +135,7 @@ ModelWrapper contains more useful functions, if you are interested please have a Analysis Pass ============= -An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis` . +An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis`. .. _transformation_pass: @@ -148,26 +146,26 @@ A transformation passes changes (transforms) the given model, it gets the model .. _mem_mode: -MatrixVectorActivation *mem_mode* -================================== +HLS variant of MatrixVectorActivation: *mem_mode* +================================================= FINN supports three types of the so-called *mem_mode* attrıbute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. Currently three settings for the *mem_mode* are supported in FINN: -* "const" +* "internal_embedded" (former "const" mode) -* "decoupled" +* "internal_decoupled" (former "decoupled" mode) * "external" -The following picture shows the idea behind the "const" and "decoupled" mode. +The following picture shows the idea behind the "internal_embedded" and "internal_decoupled" mode. .. image:: img/mem_mode.png :scale: 55% :align: center -Const mode ----------- -In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. +Internal_embedded mode +------------------------ +In *internal_embedded* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *internal_embedded* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. Advantages: @@ -175,17 +173,15 @@ Advantages: * easier to debug layer in cppsim since no additional components -* well-tested and mature components - Disadvantages: * can lead to very long HLS synthesis times for certain weight array shapes * less control over the weight memory FPGA primitives, Vivado HLS doesn't always make the best resource allocation decisions -Decoupled mode --------------- -In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode. +Internal_decoupled mode +------------------------ +In *internal_decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *internal_embedded* mode. Advantages: @@ -197,14 +193,12 @@ Advantages: Disadvantages: -* somewhat less well-tested compared to the const mode - -* higher resource footprint due to additional weight streamer and weight FIFO +* slightly higher resource footprint due to additional weight streamer and weight FIFO How to set *mem_mode* --------------------- -When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*. +When the nodes in the network are specialized to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the specialization to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is set in the node attributes of the nodes and can be passed as part of the folding configuration. The default is *internal_decoupled*. .. _folding_factors: @@ -217,46 +211,43 @@ Constraints to folding factors per layer * - **Layers** - **Parameters** - **Constraints** - * - Addstreams_Batch + * - Addstreams - PE - inp_channels % PE == 0 - * - ChannelwiseOp_Batch + * - ChannelwiseOp - PE - channels % PE == 0 * - ConvolutionInputGenerator - SIMD - inp_channels % SIMD == 0 - * - ConvolutionInputGenerator1d - - SIMD - - inp_channels % SIMD == 0 * - Downsampler - SIMD - inp_channels % SIMD == 0 - * - DuplicateStreams_Batch + * - DuplicateStreams - PE - channels % PE == 0 - * - Eltwise + * - StreamingEltwise - PE - inp_channels % PE == 0 - * - FMPadding_batch + * - FMPadding - SIMD - inp_channels % SIMD == 0 - * - FMPadding_rtl + * - FMPadding_Pixel - SIMD - inp_channels % SIMD == 0 - * - Globalaccpool_Batch + * - Globalaccpool - PE - channels % PE == 0 - * - Labelselect_Batch + * - Labelselect - PE - num_labels % PE == 0 * - MatrixVectorActivation - PE & SIMD - MH % PE == 0 & MW % SIMD == 0 - * - Pool_Batch + * - Pool - PE - inp_channels % PE == 0 - * - Thresholding_Batch + * - Thresholding - PE - MH % PE == 0 * - VectorVectorActivation @@ -280,9 +271,6 @@ This RTL version is an alternative to the original `HLS implementation Date: Mon, 25 Mar 2024 11:37:51 +0000 Subject: [PATCH 278/291] [transform]: remove resType selection of VVAU --- src/finn/transformation/fpgadataflow/convert_to_hw_layers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index e2f638ed62..897d714bf8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1644,7 +1644,6 @@ def apply(self, model): [mt_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - resType="lut", PE=pe, Dim=[mm_in_shape[1], mm_in_shape[2]], Channels=channels, @@ -1673,7 +1672,6 @@ def apply(self, model): [mm_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - resType="lut", PE=pe, Dim=[mm_in_shape[1], mm_in_shape[2]], Channels=channels, From e8ae3c44eaf91d93ee298c00e71d68b4fdaa645b Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Mon, 25 Mar 2024 11:38:07 +0000 Subject: [PATCH 279/291] [tests]: renamed VectorVectorActivation to VVAU --- tests/fpgadataflow/test_depthwise_convolution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index bde5e918e3..24bc2f3afe 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -190,7 +190,7 @@ def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type.startswith("VectorVectorActivation"): + elif n.op_type.startswith("VVAU"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) new_model = new_model.transform(SetExecMode("cppsim")) @@ -235,7 +235,7 @@ def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type.startswith("VectorVectorActivation"): + elif n.op_type.startswith("VVAU"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) From e057fc9a121fefbe2410f255e4fff08ea39bff44 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 11:48:02 +0000 Subject: [PATCH 280/291] [Docs] Update top level markdown files Signed-off-by: auphelia --- CHANGELOG.rst | 10 --------- CONTRIBUTING.md | 54 ++++++++++++++++++++++++++++++++++++++++++++++++- LICENSE.txt | 3 ++- README.md | 7 +++---- 4 files changed, 58 insertions(+), 16 deletions(-) delete mode 100644 CHANGELOG.rst diff --git a/CHANGELOG.rst b/CHANGELOG.rst deleted file mode 100644 index 226e6f5931..0000000000 --- a/CHANGELOG.rst +++ /dev/null @@ -1,10 +0,0 @@ -========= -Changelog -========= - -Version 0.1 -=========== - -- Feature A added -- FIX: nasty bug #1729 fixed -- add your changes here! diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d376a1b42b..53a505fb41 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,6 +29,58 @@ Please follow the steps below and be sure that your contribution complies with o 1. The main branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the development branch. -3. We will review your contribution and, if any additional fixes or modifications are +3. ### 3. Sign Your Work + +Please use the *Signed-off-by* line at the end of your patch which indicates that you accept the Developer Certificate of Origin (DCO) defined by https://developercertificate.org/ reproduced below:: + +``` + Developer Certificate of Origin + Version 1.1 + + Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + 1 Letterman Drive + Suite D4700 + San Francisco, CA, 94129 + + Everyone is permitted to copy and distribute verbatim copies of this + license document, but changing it is not allowed. + + + Developer's Certificate of Origin 1.1 + + By making a contribution to this project, I certify that: + + (a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + + (b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + + (c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + + (d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +Here is an example Signed-off-by line which indicates that the contributor accepts DCO:: + +``` + This is my commit message + + Signed-off-by: Jane Doe +``` + +4. We will review your contribution and, if any additional fixes or modifications are necessary, may provide feedback to guide you. When accepted, your pull request will be merged to the repository. If you have more questions please contact us. diff --git a/LICENSE.txt b/LICENSE.txt index 278564a5a4..cec78d6043 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,5 @@ -Copyright (c) 2020, Xilinx +Copyright (C) 2020-2022, Xilinx, Inc. +Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 2e1faf8f0c..0856701908 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,12 @@ -drawing +drawing [![GitHub Discussions](https://img.shields.io/badge/discussions-join-green)](https://github.com/Xilinx/finn/discussions) [![ReadTheDocs](https://readthedocs.org/projects/finn/badge/?version=latest&style=plastic)](http://finn.readthedocs.io/) -FINN is an experimental framework from Xilinx Research Labs to explore deep neural network -inference on FPGAs. +FINN is an experimental framework from Integrated Communications and AI Lab of AMD Research & Advanced Development to explore deep neural network inference on FPGAs. It specifically targets quantized neural networks, with emphasis on generating dataflow-style architectures customized for each network. @@ -28,7 +27,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## Documentation -You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. +You can view the documentation on [readthedocs](https://finn.readthedocs.io). Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. ## Community From 7b138408166ce574a85ea1b4e62655262a36fe88 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 11:50:18 +0000 Subject: [PATCH 281/291] [Docs] Fix typo in CONTRIBUTING markdown Signed-off-by: auphelia --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 53a505fb41..3f4529c400 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,7 +29,7 @@ Please follow the steps below and be sure that your contribution complies with o 1. The main branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the development branch. -3. ### 3. Sign Your Work +3. Sign Your Work Please use the *Signed-off-by* line at the end of your patch which indicates that you accept the Developer Certificate of Origin (DCO) defined by https://developercertificate.org/ reproduced below:: From d4fbd21e1320e05a03ffe58a2cb35e5ce3b6c954 Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 12:02:02 +0000 Subject: [PATCH 282/291] [Docs] Update AUTHORS md Signed-off-by: auphelia --- AUTHORS.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/AUTHORS.rst b/AUTHORS.rst index 861b81924b..5a11497fc8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -28,3 +28,9 @@ Contributors * Matthias Gehre (@mgehre-amd) * Hugo Le Blevec (@hleblevec) * Patrick Geel (@patrickgeel) +* John Monks (@jmonks-amd) +* Tim Paine (@timkpaine) +* Linus Jungemann (@LinusJungemann) +* Shashwat Khandelwal (@shashwat1198) +* Ian Colbert (@i-colbert) +* Rachit Garg (@rstar900) From 6d036947dea5ad7a75e1bbdda644017ae3b59c9a Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 15:33:55 +0000 Subject: [PATCH 283/291] [Tests/Docs] Set SWG to HLS for depthwise conv cppsim tests --- CONTRIBUTING.md | 4 +++- tests/fpgadataflow/test_depthwise_convolution.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3f4529c400..5e34624790 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -73,7 +73,9 @@ Please use the *Signed-off-by* line at the end of your patch which indicates tha this project or the open source license(s) involved. ``` -Here is an example Signed-off-by line which indicates that the contributor accepts DCO:: +You can enable Signed-off-by automatically by adding the `-s` flag to the `git commit` command. + +Here is an example Signed-off-by line which indicates that the contributor accepts DCO: ``` This is my commit message diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 24bc2f3afe..a45f253530 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -182,6 +182,12 @@ def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) + # CPPsim of RTL SWG defaults to Im2Col emulation which has no concept + # of parallelism. So, we're using the HLS-SWG for cppsim testing for now. + # Set preferred_impl_style to hls to instantiate HLS-SWG + swg_nodes = new_model.get_nodes_by_op_type("ConvolutionInputGenerator")[0] + getCustomOp(swg_nodes).set_nodeattr("preferred_impl_style", "hls") + new_model = new_model.transform(SpecializeLayers()) # set SIMD in ConvInputGen node and PE in VVAU node From 00e6e51ac70522afa7f15a08d2536c328ff9746f Mon Sep 17 00:00:00 2001 From: auphelia Date: Mon, 25 Mar 2024 17:30:20 +0000 Subject: [PATCH 284/291] [Deps] Update dockerfile with new copyright header --- docker/Dockerfile.finn | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 9d7ca809db..2ceb1f4195 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -1,4 +1,5 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2021-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,7 +28,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. FROM ubuntu:jammy-20230126 -LABEL maintainer="Yaman Umuroglu " +LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuroglu " ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt" From 1e97e9721b30b16f5436fb0fce4d1df215dc7574 Mon Sep 17 00:00:00 2001 From: johnnoel Date: Tue, 26 Mar 2024 10:05:06 +0000 Subject: [PATCH 285/291] [Tests] Force HLS components for special case cnv-2-2 on u250 and pynq-z1 --- tests/end2end/test_end2end_bnn_pynq.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index a25d7e6725..5b295655df 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -104,6 +104,7 @@ get_trained_network_and_ishape, load_test_checkpoint_or_skip, ) +from finn.util.fpgadataflow import is_fpgadataflow_node build_dir = os.environ["FINN_BUILD_DIR"] target_clk_ns = 20 @@ -598,6 +599,12 @@ def test_specialize_layers(self, topology, wbits, abits, board): prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) # set preferred impl style to hls for all layers + force_hls_boards = ["Pynq-Z1", "U250"] + if topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) @@ -628,9 +635,19 @@ def test_specialize_layers(self, topology, wbits, abits, board): ("StreamingMaxPool_hls", 2), ("LabelSelect_hls", 1), ], + "cnv-2-2": [ + ("Transpose", 1), + ("Thresholding_hls", 1), + ("ConvolutionInputGenerator_hls", 6), + ("MVAU_hls", 9), + ("StreamingMaxPool_hls", 2), + ("LabelSelect_hls", 1), + ], } if topology == "tfc" and wbits == 1 and abits == 1: exp_key = "tfc-1-1" + elif topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: + exp_key = "cnv-2-2" else: exp_key = topology exp_layer_counts = exp_layer_counts[exp_key] From aa361f5d8a1b083538da3dcb8f07f47879823588 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 26 Mar 2024 11:49:47 +0000 Subject: [PATCH 286/291] [rtl swg]: interleave channels for CPPsim --- .../rtl/convolutioninputgenerator_rtl.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 68760227d7..cb4ce1e884 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -33,6 +33,7 @@ from qonnx.core.datatype import DataType from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp from qonnx.util.basic import roundup_to_integer_multiple from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( @@ -290,6 +291,19 @@ def execute_node(self, context, graph): if mode == "cppsim": ConvolutionInputGenerator.execute_node(self, context, graph) + # Interleave channels such that cppsim of ConvolutionInputGenerator_rtl + # has a notion of SIMD parallelism. Subsequent VVAU_{hls/rtl} expects + # the channels to be interleaved (i.e. to match their PE parallelism). + node = self.onnx_node + im2col_out = context[node.output[0]] + simd = getCustomOp(node).get_nodeattr("SIMD") + ofm_h, ofm_w = getCustomOp(node).get_nodeattr("OFMDim") + k_h, k_w = getCustomOp(node).get_nodeattr("ConvKernelDim") + ifm_ch = getCustomOp(node).get_nodeattr("IFMChannels") + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, k_h * k_w, ifm_ch // simd, simd) + im2col_out = im2col_out.transpose(0, 1, 2, 4, 3, 5) + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, ifm_ch * k_h * k_w) + context[node.output[0]] = im2col_out elif mode == "rtlsim": node = self.onnx_node exp_ishape = self.get_normal_input_shape() From 87d11c6c3659ca060274eb76f49bdddc54119208 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 26 Mar 2024 11:50:40 +0000 Subject: [PATCH 287/291] [vvau]: RTL-swg in cppsim now interleaves channels -- updated 'pe' selection --- .../custom_op/fpgadataflow/vectorvectoractivation.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index 7f1bf72964..ef80b24a2e 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -122,15 +122,12 @@ def execute_node(self, context, graph): (k_h, k_w) = self.get_nodeattr("Kernel") channels = self.get_nodeattr("Channels") producer = [x for x in graph.node if x.output[0] == node.input[0]] - exec_mode = self.get_nodeattr("exec_mode") - if ( - not bool(producer) - or producer[0].op_type == "ConvolutionInputGenerator_hls" - or (producer[0].op_type == "ConvolutionInputGenerator_rtl" and exec_mode == "rtlsim") + if bool(producer) and ( + producer[0].op_type == "Im2Col" or producer[0].op_type == "ConvolutionInputGenerator" ): - pe = self.get_nodeattr("PE") - else: pe = channels + else: + pe = self.get_nodeattr("PE") # Reorder the input activations. Note that PE gets interleaved by the SWG, # so we have to untangle and for simplicity of computation assume pe=1. From 42852dfe87899969974bc47f897d66e77d2829d9 Mon Sep 17 00:00:00 2001 From: mmrahorovic Date: Tue, 26 Mar 2024 13:56:34 +0000 Subject: [PATCH 288/291] [tests]: remove defaulting SWG to HLS --- tests/fpgadataflow/test_depthwise_convolution.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index a45f253530..b8242df933 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -182,16 +182,9 @@ def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - # CPPsim of RTL SWG defaults to Im2Col emulation which has no concept - # of parallelism. So, we're using the HLS-SWG for cppsim testing for now. - # Set preferred_impl_style to hls to instantiate HLS-SWG - swg_nodes = new_model.get_nodes_by_op_type("ConvolutionInputGenerator")[0] - getCustomOp(swg_nodes).set_nodeattr("preferred_impl_style", "hls") - new_model = new_model.transform(SpecializeLayers()) # set SIMD in ConvInputGen node and PE in VVAU node - for n in new_model.graph.node: if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) @@ -236,7 +229,6 @@ def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): new_model = new_model.transform(SpecializeLayers()) # set SIMD in ConvInputGen node and PE in VVAU node - for n in new_model.graph.node: if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) From 86e28e4765347f5ccd0bfcbde675f5b631a3f95f Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 26 Mar 2024 15:39:13 +0000 Subject: [PATCH 289/291] [Tests] Enable interleaving of output for dw only --- .../rtl/convolutioninputgenerator_rtl.py | 24 ++++++++++--------- .../test_fpgadataflow_convinputgenerator.py | 5 +--- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index cb4ce1e884..321522e7ba 100755 --- a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -291,19 +291,21 @@ def execute_node(self, context, graph): if mode == "cppsim": ConvolutionInputGenerator.execute_node(self, context, graph) - # Interleave channels such that cppsim of ConvolutionInputGenerator_rtl + # if depthwise = 1 + # interleave channels such that cppsim of ConvolutionInputGenerator_rtl # has a notion of SIMD parallelism. Subsequent VVAU_{hls/rtl} expects # the channels to be interleaved (i.e. to match their PE parallelism). - node = self.onnx_node - im2col_out = context[node.output[0]] - simd = getCustomOp(node).get_nodeattr("SIMD") - ofm_h, ofm_w = getCustomOp(node).get_nodeattr("OFMDim") - k_h, k_w = getCustomOp(node).get_nodeattr("ConvKernelDim") - ifm_ch = getCustomOp(node).get_nodeattr("IFMChannels") - im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, k_h * k_w, ifm_ch // simd, simd) - im2col_out = im2col_out.transpose(0, 1, 2, 4, 3, 5) - im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, ifm_ch * k_h * k_w) - context[node.output[0]] = im2col_out + if self.get_nodeattr("depthwise"): + node = self.onnx_node + im2col_out = context[node.output[0]] + simd = getCustomOp(node).get_nodeattr("SIMD") + ofm_h, ofm_w = getCustomOp(node).get_nodeattr("OFMDim") + k_h, k_w = getCustomOp(node).get_nodeattr("ConvKernelDim") + ifm_ch = getCustomOp(node).get_nodeattr("IFMChannels") + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, k_h * k_w, ifm_ch // simd, simd) + im2col_out = im2col_out.transpose(0, 1, 2, 4, 3, 5) + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, ifm_ch * k_h * k_w) + context[node.output[0]] = im2col_out elif mode == "rtlsim": node = self.onnx_node exp_ishape = self.get_normal_input_shape() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 02aaf85851..45ca74fbea 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -217,10 +217,7 @@ def test_fpgadataflow_slidingwindow( # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] - # if cppsim and impl style rtl is selected, the node execution is done by the hw op parent - # so, no reordering/shaping of the output is needed - # because there is no concept of SIMD parallelism in the hw abstraction layer execution - if dw == 0 or (optype == "ConvolutionInputGenerator_rtl" and exec_mode == "cppsim"): + if dw == 0: assert (y_produced == y_expected).all() else: y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) From 84654a34170819c035f021590970fa82b49973bb Mon Sep 17 00:00:00 2001 From: auphelia Date: Tue, 26 Mar 2024 17:24:24 +0000 Subject: [PATCH 290/291] Fix linting --- tests/end2end/test_end2end_bnn_pynq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 5b295655df..94134967fa 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -95,6 +95,7 @@ MoveScalarLinearPastInvariants, ) from finn.util.basic import get_finn_root, make_build_dir, test_board_map +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -104,7 +105,6 @@ get_trained_network_and_ishape, load_test_checkpoint_or_skip, ) -from finn.util.fpgadataflow import is_fpgadataflow_node build_dir = os.environ["FINN_BUILD_DIR"] target_clk_ns = 20 From 9507e234d2a458c847ce8e43cac47df4bbe36192 Mon Sep 17 00:00:00 2001 From: aziz bahri Date: Wed, 27 Mar 2024 14:44:10 +0000 Subject: [PATCH 291/291] [Thresholding RTL] extract RAM trigger to json config --- src/finn/builder/build_dataflow_steps.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 7508981485..443d2df54c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -432,6 +432,8 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi "resType", "mem_mode", "runtime_writeable_weights", + "depth_trigger_uram", + "depth_trigger_bram", ] extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) @@ -607,6 +609,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "runtime_writeable_weights", "inFIFODepths", "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", ] extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)